Date: (Sat) May 30, 2015
Data: Source: Training: https://kaggle2.blob.core.windows.net/competitions-data/kaggle/4347/NYTimesBlogTrain.csv New: https://kaggle2.blob.core.windows.net/competitions-data/kaggle/4347/NYTimesBlogTest.csv
Time period:
Based on analysis utilizing <> techniques,
Use plot.ly for interactive plots ?
varImp for randomForest crashes in caret version:6.0.41 -> submit bug report
extensions toward multiclass classification are scheduled for the next release
glm_dmy_mdl should use the same method as glm_sel_mdl until a custom dummy classifier is implemented
rm(list=ls())
set.seed(12345)
options(stringsAsFactors=FALSE)
source("~/Dropbox/datascience/R/myscript.R")
source("~/Dropbox/datascience/R/mydsutils.R")
source("~/Dropbox/datascience/R/myplot.R")
source("~/Dropbox/datascience/R/mypetrinet.R")
source("~/Dropbox/datascience/R/myplclust.R")
# Gather all package requirements here
suppressPackageStartupMessages(require(doMC))
registerDoMC(4) # max(length(glb_txt_vars), glb_n_cv_folds) + 1
#packageVersion("snow")
#require(sos); findFn("cosine", maxPages=2, sortby="MaxScore")
# Analysis control global variables
# Locations of the training file and of the "new" (to-be-predicted) file
glb_trnng_url <- "https://kaggle2.blob.core.windows.net/competitions-data/kaggle/4347/NYTimesBlogTrain.csv"
glb_newdt_url <- "https://kaggle2.blob.core.windows.net/competitions-data/kaggle/4347/NYTimesBlogTest.csv"
glb_out_pfx <- "NYTBlogs_cluster_"
glb_save_envir <- FALSE # or TRUE
# TRUE: the new observations are imported from glb_newdt_url.
# FALSE: they are carved out of the training data using the glb_split_* settings below.
glb_is_separate_newent_dataset <- TRUE # or FALSE
# The glb_split_* settings are only consulted when glb_is_separate_newent_dataset is FALSE
glb_split_entity_newent_datasets <- TRUE # or FALSE
glb_split_newdata_method <- "sample" # "condition" or "sample" or "copy"
glb_split_newdata_condition <- "<col_name> <condition_operator> <value>" # or NULL
glb_split_newdata_size_ratio <- 0.3 # > 0 & < 1
glb_split_sample.seed <- 123 # or any integer
# Columns removed from all data frames right after import
glb_drop_vars <- c(NULL) # or c("<col_name>")
#glb_max_fitent_obs <- 2238 # NULL # or any integer
glb_max_fitent_obs <- NULL # or any integer
# Problem type flags; they drive the model-method and selection-criteria choices further down
glb_is_regression <- FALSE; glb_is_classification <- TRUE; glb_is_binomial <- TRUE
glb_rsp_var_raw <- "Popular"
# for classification, the response variable has to be a factor
glb_rsp_var <- "Popular.fctr"
# if the response factor is based on numbers e.g (0/1 vs. "A"/"B"),
# caret predict(..., type="prob") crashes
# Map the raw response onto the two-level factor used for modelling.
#
# raw: vector of raw response values. Entries equal to 1 become "Y", every
#      other non-missing entry becomes "N", and NA stays NA.
# Returns a factor whose levels are always c("N", "Y") in that order, so "N"
# is the reference level regardless of which classes occur in `raw`.
glb_map_rsp_raw_to_var <- function(raw) {
    # Declare both levels explicitly. Deriving the levels from the data and
    # then calling relevel(ref="N") fails with "'ref' must be an existing
    # level" when `raw` holds no "N" cases (or only NAs), and leaves a
    # one-level factor when it holds no "Y" cases.
    factor(ifelse(raw == 1, "Y", "N"), levels=c("N", "Y"))
    # Alternative mappings:
    #as.factor(paste0("B", raw))
    #as.factor(raw)
}
glb_map_rsp_raw_to_var(c(1, 1, 0, 0, 0))
## [1] Y Y N N N
## Levels: N Y
# Inverse of the response mapping: convert the response factor back to a
# numeric coding by shifting its 1-based level codes down so they start at 0.
#
# var: response factor.
# Returns a numeric vector the same length as `var`.
glb_map_rsp_var_to_raw <- function(var) {
    level_codes <- as.numeric(var)
    level_codes - 1
    # Alternative inverse mappings:
    #as.numeric(var)
    #levels(var)[as.numeric(var)]
    #c(" <=50K", " >50K")[as.numeric(var)]
}
glb_map_rsp_var_to_raw(glb_map_rsp_raw_to_var(c(1, 1, 0, 0, 0)))
## [1] 1 1 0 0 0
# Guard: when the modelling response column differs from the raw response
# column, a raw -> modelling mapping function must have been supplied.
# `&&` is used because if() takes a single scalar condition; the elementwise
# `&` evaluates both operands unconditionally and does not short-circuit.
if ((glb_rsp_var != glb_rsp_var_raw) && is.null(glb_map_rsp_raw_to_var)) {
    stop("glb_map_rsp_raw_to_var function expected")
}
glb_rsp_var_out <- paste0(glb_rsp_var, ".predict.") # model_id is appended later
# List info gathered for various columns
# <col_name>: <description>; <notes>
# NewsDesk = the New York Times desk that produced the story
# SectionName = the section the article appeared in (Opinion, Arts, Technology, etc.)
# SubsectionName = the subsection the article appeared in (Education, Small Business, Room for Debate, etc.)
# Headline = the title of the article
# Snippet = a small portion of the article text
# Abstract = a summary of the blog article, written by the New York Times
# WordCount = the number of words in the article
# created WordCount.log
# PubDate = the publication date, in the format "Year-Month-Day Hour:Minute:Second"
glb_date_vars <- c("PubDate")
# UniqueID = a unique identifier for each article
glb_id_vars <- c("UniqueID")
glb_is_textual <- TRUE # vs. glb_is_numerical ???
#Sys.setlocale("LC_ALL", "C") # For english
glb_txt_vars <- c("Headline", "Snippet", "Abstract")
glb_append_stop_words <- list() # NULL # or c("<freq_word>")
# Remember to use unstemmed words
glb_append_stop_words[["Headline"]] <- c(NULL
# ,"clip" # Highly correlated to H.P.daily.clip.report
# ,"springsummer" # Highly correlated to H.npnct14.log
)
glb_append_stop_words[["Snippet"]] <- c(NULL
# ,"herald" # Highly correlated to S.T.tribun
# ,"photo" # Highly correlated to A.T.photo
# ,"senate", "senator", "senatorial", "senators", "senatorselect" # Highly correlated to A.T.senat
# ,"archival", "archive", "archives", "archivist" # Highly correlated to H.P.year.colon
# ,"year", "years" # Highly correlated with A.T.year
# ,"appear", "appeared", "appearing", "appears" # Highly correlated with A.T.appear
)
glb_append_stop_words[["Abstract"]] <- c(NULL
# ,"archives", "articles", "diary", "first", "herald", "president", "share", "show", "tribune" # These are repeated in Snippet terms
# ,"highlight", "highlighted", "highlighting", "highlights" # Correlated with S.T.highlight
# ,"make", "makes" # correlated with S.T.make
# ,"week", "weekly", "weeks" # correlated with S.T.week
)
# Remember to use stemmed terms
glb_important_terms <- list()
# Properties:
# numrows(glb_feats_df) << numrows(glb_fitobs_df)
# Select terms that appear in at least 0.2 * O(FP/FN(glb_OOBobs_df))
# numrows(glb_OOBobs_df) = 1.1 * numrows(glb_newobs_df)
glb_sprs_thresholds <- c(0.988, 0.970, 0.970) # Generates 29, 22, 22 terms
#glb_sprs_thresholds <- c(0.990, 0.970, 0.970) # Generates 41, 22, 22 terms
#glb_sprs_thresholds <- c(0.985, 0.970, 0.970) # Generates 16, 22, 22 terms
#glb_sprs_thresholds <- c(0.975, 0.965, 0.965) # Generates 08, 14, 14 terms
#glb_sprs_thresholds <- c(0.982, 0.980, 0.980) # Generates 10, 61, 62 terms
names(glb_sprs_thresholds) <- glb_txt_vars
# Assemble the columns that are excluded from the feature set
glb_exclude_vars_as_features <- c(NULL) # or c("<var_name>")
if (glb_is_textual) {
    # the raw text columns themselves
    glb_exclude_vars_as_features <-
        union(glb_exclude_vars_as_features, glb_txt_vars)
}
if (glb_rsp_var_raw != glb_rsp_var) {
    # the raw response, when a separate modelling response is in use
    glb_exclude_vars_as_features <-
        union(glb_exclude_vars_as_features, glb_rsp_var_raw)
}
# Placeholder for features known to be caused by the prediction variable
glb_exclude_vars_as_features <-
    union(glb_exclude_vars_as_features, c(NULL)) # or c("<col_name>")
# List output vars (useful during testing in console)
# glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features,
# grep(glb_rsp_var_out, names(glb_trnobs_df), value=TRUE))
glb_impute_na_data <- TRUE # or FALSE
glb_mice_complete.seed <- 144 # or any integer
glb_cluster <- TRUE # or FALSE
# Start with empty model containers; presumably filled by later modelling chunks (not visible here)
glb_models_lst <- list(); glb_models_df <- data.frame()
# rpart: .rnorm messes with the models badly
# caret creates dummy vars for factor feats which messes up the tuning
# - better to feed as.numeric(<feat>.fctr) to caret
# Candidate modelling methods, picked by problem type:
# regression, binomial classification, or anything else (presumably
# multi-class classification).
glb_models_method_vctr <-
    if (glb_is_regression) {
        c("lm", "glm", "rpart", "rf")
    } else if (glb_is_binomial) {
        # Other binomial line-ups that were tried:
        #   c("glm", "rpart", "rf")
        #   c("glm", "bayesglm", "rpart")
        c("glm", "rpart")
    } else {
        c("rpart", "rf")
    }
# Baseline prediction model feature(s)
glb_Baseline_mdl_var <- NULL # or c("<col_name>")
glb_model_metric_terms <- NULL # or matrix(c(
# 0,1,2,3,4,
# 2,0,1,2,3,
# 4,2,0,1,2,
# 6,4,2,0,1,
# 8,6,4,2,0
# ), byrow=TRUE, nrow=5)
glb_model_metric <- NULL # or "<metric_name>"
glb_model_metric_maximize <- NULL # or FALSE (TRUE is not the default for both classification & regression)
glb_model_metric_smmry <- NULL # or function(data, lev=NULL, model=NULL) {
# confusion_mtrx <- t(as.matrix(confusionMatrix(data$pred, data$obs)))
# #print(confusion_mtrx)
# #print(confusion_mtrx * glb_model_metric_terms)
# metric <- sum(confusion_mtrx * glb_model_metric_terms) / nrow(data)
# names(metric) <- glb_model_metric
# return(metric)
# }
glb_tune_models_df <-
rbind(
#data.frame(parameter="cp", min=0.00005, max=0.00005, by=0.000005),
#seq(from=0.01, to=0.01, by=0.01)
#data.frame(parameter="mtry", min=080, max=100, by=10),
#data.frame(parameter="mtry", min=08, max=10, by=1),
data.frame(parameter="dummy", min=2, max=4, by=1)
)
# or NULL
glb_n_cv_folds <- 3 # or NULL
glb_clf_proba_threshold <- NULL # 0.5
# Criteria used to rank candidate models, picked by problem type.
# If both flags are set, the classification choice overrides the regression one.
if (glb_is_regression) {
    glb_model_evl_criteria <-
        c("min.RMSE.OOB", "max.R.sq.OOB", "max.Adj.R.sq.fit")
}
if (glb_is_classification) {
    glb_model_evl_criteria <- if (glb_is_binomial) {
        c("max.Accuracy.OOB", "max.auc.OOB", "max.Kappa.OOB", "min.aic.fit")
    } else {
        c("max.Accuracy.OOB", "max.Kappa.OOB")
    }
}
glb_sel_mdl_id <- NULL # or "<model_id_prefix>.<model_method>"
glb_fin_mdl_id <- glb_sel_mdl_id # or "Final"
# Depict process
# Build a petri net describing the analysis workflow (6 transitions, 4 places,
# 12 arcs) and print its ggplot rendering with the axes flipped.
# petrinet() and ggplot.petrinet() come from the sourced project helpers.
glb_analytics_pn <- petrinet(name="glb_analytics_pn",
# Transitions: the data / model steps, with their plot coordinates
trans_df=data.frame(id=1:6,
name=c("data.training.all","data.new",
"model.selected","model.final",
"data.training.all.prediction","data.new.prediction"),
x=c( -5,-5,-15,-25,-25,-35),
y=c( -5, 5, 0, 0, -5, 5)
),
# Places: the stages between the steps
# NOTE(review): M0 is presumably the initial marking (tokens per place) - confirm in mypetrinet.R
places_df=data.frame(id=1:4,
name=c("bgn","fit.data.training.all","predict.data.new","end"),
x=c( -0, -20, -30, -40),
y=c( 0, 0, 0, 0),
M0=c( 3, 0, 0, 0)
),
# Arcs: begin[i] -> end[i], pairing the two vectors position by position
arcs_df=data.frame(
begin=c("bgn","bgn","bgn",
"data.training.all","model.selected","fit.data.training.all",
"fit.data.training.all","model.final",
"data.new","predict.data.new",
"data.training.all.prediction","data.new.prediction"),
end =c("data.training.all","data.new","model.selected",
"fit.data.training.all","fit.data.training.all","model.final",
"data.training.all.prediction","predict.data.new",
"predict.data.new","data.new.prediction",
"end","end")
))
#print(ggplot.petrinet(glb_analytics_pn))
print(ggplot.petrinet(glb_analytics_pn) + coord_flip())
## Loading required package: grid
glb_analytics_avl_objs <- NULL
glb_chunks_df <- myadd_chunk(NULL, "import.data")
## label step_major step_minor bgn end elapsed
## 1 import.data 1 0 13.224 NA NA
1.0: import data
glb_trnobs_df <- myimport_data(url=glb_trnng_url, comment="glb_trnobs_df",
force_header=TRUE)
## [1] "Reading file ./data/NYTimesBlogTrain.csv..."
## [1] "dimensions of data in ./data/NYTimesBlogTrain.csv: 6,532 rows x 10 cols"
## NewsDesk SectionName SubsectionName
## 1 Business Crosswords/Games
## 2 Culture Arts
## 3 Business Business Day Dealbook
## 4 Business Business Day Dealbook
## 5 Science Health
## 6 Science Health
## Headline
## 1 More School Daze
## 2 New 96-Page Murakami Work Coming in December
## 3 Public Pension Funds Stay Mum on Corporate Expats
## 4 Boot Camp for Bankers
## 5 Of Little Help to Older Knees
## 6 A Benefit of Legal Marijuana
## Snippet
## 1 A puzzle from Ethan Cooper that reminds me that a bill is due.
## 2 The Strange Library will arrive just three and a half months after Mr. Murakamis latest novel, Colorless Tsukuru Tazaki and His Years of Pilgrimage.
## 3 Public pension funds have major stakes in American companies moving overseas to cut their tax bills. But they are saying little about the strategy, which could hurt the nations tax base.
## 4 As they struggle to find new business to bolster sluggish earnings, banks consider the nations 25 million veterans and service members ideal customers.
## 5 Middle-aged and older patients are unlikely to benefit in the long term from surgery to repair tears in the meniscus, pads of cartilage in the knee, a new review of studies has found.
## 6 A new study has found evidence that legal access to marijuana is associated with fewer opioid overdose deaths, but researchers said their findings should not be used as the basis for the wide adoption of legalized cannabis.
## Abstract
## 1 A puzzle from Ethan Cooper that reminds me that a bill is due.
## 2 The Strange Library will arrive just three and a half months after Mr. Murakamis latest novel, Colorless Tsukuru Tazaki and His Years of Pilgrimage.
## 3 Public pension funds have major stakes in American companies moving overseas to cut their tax bills. But they are saying little about the strategy, which could hurt the nations tax base.
## 4 As they struggle to find new business to bolster sluggish earnings, banks consider the nations 25 million veterans and service members ideal customers.
## 5 Middle-aged and older patients are unlikely to benefit in the long term from surgery to repair tears in the meniscus, pads of cartilage in the knee, a new review of studies has found.
## 6 A new study has found evidence that legal access to marijuana is associated with fewer opioid overdose deaths, but researchers said their findings should not be used as the basis for the wide adoption of legalized cannabis.
## WordCount PubDate Popular UniqueID
## 1 508 2014-09-01 22:00:09 1 1
## 2 285 2014-09-01 21:14:07 0 2
## 3 1211 2014-09-01 21:05:36 0 3
## 4 1405 2014-09-01 20:43:34 1 4
## 5 181 2014-09-01 18:58:51 1 5
## 6 245 2014-09-01 18:52:22 1 6
## NewsDesk SectionName SubsectionName
## 226 Styles
## 995
## 3327
## 4753 Multimedia
## 4802 Business Crosswords/Games
## 6463 TStyle
## Headline
## 226 For Tavi Gevinson, Fashion Takes a Back Seat, for Now
## 995 Reconsidering What to Call an Extremist Group
## 3327 Clinton's Diagnosis of What's Wrong With Politics
## 4753 'Off Color' and on Target About Race in America
## 4802 Daniel Finkel's Circle-Toss Game
## 6463 Entering the Void
## Snippet
## 226 Tavi Gevinson, the teenage fashion star turned Broadway actress, wont be much of a player at New York Fashion Week this season.
## 995 Editors have decided to adjust how The Times refer to an Islamic extremist group that controls territory in Syria and Iraq.
## 3327 Hillary Rodham Clinton continued to laugh off questions about her presidential aspirations on Tuesday, but she did shed some light on what she thinks is wrong in Washington.
## 4753 Off Color, a New York Times video series, looks at how artists of color are making sharp social commentary about race in America through comedy and performance.
## 4802 By math educator Daniel Finkel, a puzzle thats childs play. Can you figure it out?
## 6463 The Spanish artist Miquel Barcel closely examines the basic materials of life in response to Edward Hirsch questioning his own belief in a higher power.
## Abstract
## 226 Tavi Gevinson, the teenage fashion star turned Broadway actress, wont be much of a player at New York Fashion Week this season.
## 995 Editors have decided to adjust how The Times refer to an Islamic extremist group that controls territory in Syria and Iraq.
## 3327 Hillary Rodham Clinton continued to laugh off questions about her presidential aspirations on Tuesday, but she did shed some light on what she thinks is wrong in Washington.
## 4753 Off Color, a New York Times video series, looks at how artists of color are making sharp social commentary about race in America through comedy and performance.
## 4802 By math educator Daniel Finkel, a puzzle thats childs play. Can you figure it out?
## 6463 The Spanish artist Miquel Barcel closely examines the basic materials of life in response to Edward Hirsch questioning his own belief in a higher power.
## WordCount PubDate Popular UniqueID
## 226 459 2014-09-04 16:55:57 0 226
## 995 301 2014-09-15 16:05:13 0 995
## 3327 236 2014-10-14 14:45:51 0 3327
## 4753 393 2014-11-02 05:00:13 0 4753
## 4802 1628 2014-11-03 12:00:04 1 4802
## 6463 264 2014-11-27 12:00:09 0 6463
## NewsDesk SectionName SubsectionName
## 6527 Foreign
## 6528 Opinion Room For Debate
## 6529 Foreign
## 6530 TStyle
## 6531 Multimedia
## 6532 Business
## Headline
## 6527 1914: Russians Dominate in East Poland
## 6528 Finding a Secretary of Defense
## 6529 1889: Metropolitan Opera House Reopens in New York
## 6530 The Daily Gift: Picasso Plates for Creative Dining
## 6531 Racing From New York to Barcelona
## 6532 Math Anxiety: Why Hollywood Makes Robots of Alan Turing and Other Geniuses
## Snippet
## 6527 From the International Herald Tribune archives: Russians dominate in East Poland in 1914.
## 6528 If Chuck Hagel isn't the right Pentagon chief to respond to an onslaught of global crises, who is?
## 6529 From the International Herald Tribune archives: The Metropolitan Opera House reopens in New York in 1889.
## 6530 Each day until Christmas, the editors of T share a new holiday gift idea.
## 6531 A sailboat race from New York to Barcelona was the setting for a thrilling and sometimes terrifying video about this challenging sport.
## 6532 The visionary who stares at formulas written on walls or mirrors or better yet, thin air has become a Hollywood trope. So has the depiction of the genius who cant connect with real people.
## Abstract
## 6527 From the International Herald Tribune archives: Russians dominate in East Poland in 1914.
## 6528 If Chuck Hagel isn't the right Pentagon chief to respond to an onslaught of global crises, who is?
## 6529 From the International Herald Tribune archives: The Metropolitan Opera House reopens in New York in 1889.
## 6530 Each day until Christmas, the editors of T share a new holiday gift idea.
## 6531 A sailboat race from New York to Barcelona was the setting for a thrilling and sometimes terrifying video about this challenging sport.
## 6532 The visionary who stares at formulas written on walls or mirrors or better yet, thin air has become a Hollywood trope. So has the depiction of the genius who cant connect with real people.
## WordCount PubDate Popular UniqueID
## 6527 176 2014-11-30 13:48:40 0 6527
## 6528 1597 2014-11-30 13:27:23 0 6528
## 6529 214 2014-11-30 09:44:57 0 6529
## 6530 61 2014-11-30 09:00:43 0 6530
## 6531 441 2014-11-30 09:00:22 0 6531
## 6532 921 2014-11-30 07:00:40 0 6532
## 'data.frame': 6532 obs. of 10 variables:
## $ NewsDesk : chr "Business" "Culture" "Business" "Business" ...
## $ SectionName : chr "Crosswords/Games" "Arts" "Business Day" "Business Day" ...
## $ SubsectionName: chr "" "" "Dealbook" "Dealbook" ...
## $ Headline : chr "More School Daze" "New 96-Page Murakami Work Coming in December" "Public Pension Funds Stay Mum on Corporate Expats" "Boot Camp for Bankers" ...
## $ Snippet : chr "A puzzle from Ethan Cooper that reminds me that a bill is due." "The Strange Library will arrive just three and a half months after Mr. Murakamis latest novel, Colorless Tsukuru Tazaki and His"| __truncated__ "Public pension funds have major stakes in American companies moving overseas to cut their tax bills. But they are saying little"| __truncated__ "As they struggle to find new business to bolster sluggish earnings, banks consider the nations 25 million veterans and service "| __truncated__ ...
## $ Abstract : chr "A puzzle from Ethan Cooper that reminds me that a bill is due." "The Strange Library will arrive just three and a half months after Mr. Murakamis latest novel, Colorless Tsukuru Tazaki and His"| __truncated__ "Public pension funds have major stakes in American companies moving overseas to cut their tax bills. But they are saying little"| __truncated__ "As they struggle to find new business to bolster sluggish earnings, banks consider the nations 25 million veterans and service "| __truncated__ ...
## $ WordCount : int 508 285 1211 1405 181 245 258 893 1077 188 ...
## $ PubDate : chr "2014-09-01 22:00:09" "2014-09-01 21:14:07" "2014-09-01 21:05:36" "2014-09-01 20:43:34" ...
## $ Popular : int 1 0 0 1 1 1 0 1 1 0 ...
## $ UniqueID : int 1 2 3 4 5 6 7 8 9 10 ...
## - attr(*, "comment")= chr "glb_trnobs_df"
## NULL
# Obtain the "new" observations (glb_newobs_df) and the combined data frame
# (glb_allobs_df).
#   - separate dataset: import the new data from glb_newdt_url and stack it
#     under the training data;
#   - otherwise: carve the new data out of the training data according to
#     glb_split_newdata_method ("condition", "sample" or "copy").
if (glb_is_separate_newent_dataset) {
    glb_newobs_df <- myimport_data(url=glb_newdt_url, comment="glb_newobs_df",
                                   force_header=TRUE)
    # To make plots / stats / checks easier in chunk:inspectORexplore.data
    glb_allobs_df <- myrbind_df(glb_trnobs_df, glb_newobs_df)
    comment(glb_allobs_df) <- "glb_allobs_df"
} else {
    glb_allobs_df <- glb_trnobs_df
    comment(glb_allobs_df) <- "glb_allobs_df"
    if (!glb_split_entity_newent_datasets) {
        stop("Not implemented yet")
        # The statements that used to follow stop() could never run; the
        # intended fallback is kept here as a sketch only:
        #   glb_newobs_df <- glb_trnobs_df[sample(seq_len(nrow(glb_trnobs_df)),
        #                                  max(2, nrow(glb_trnobs_df) / 1000)), ]
    } else if (glb_split_newdata_method == "condition") {
        # NOTE(review): the condition string is parsed and evaluated as R code;
        # it must only ever come from this trusted configuration.
        glb_newobs_df <- do.call("subset",
            list(glb_trnobs_df, parse(text=glb_split_newdata_condition)))
        glb_trnobs_df <- do.call("subset",
            list(glb_trnobs_df, parse(text=paste0("!(",
                                                  glb_split_newdata_condition,
                                                  ")"))))
    } else if (glb_split_newdata_method == "sample") {
        # library() stops right here if caTools is not installed; require()
        # would only return FALSE and let sample.split() fail later.
        library(caTools)
        set.seed(glb_split_sample.seed)
        split <- sample.split(glb_trnobs_df[, glb_rsp_var_raw],
                              SplitRatio=(1 - glb_split_newdata_size_ratio))
        glb_newobs_df <- glb_trnobs_df[!split, ]
        glb_trnobs_df <- glb_trnobs_df[split, ]
    } else if (glb_split_newdata_method == "copy") {
        # Training and new data are both the full data set
        glb_trnobs_df <- glb_allobs_df
        comment(glb_trnobs_df) <- "glb_trnobs_df"
        glb_newobs_df <- glb_allobs_df
        comment(glb_newobs_df) <- "glb_newobs_df"
    } else {
        stop("glb_split_newdata_method should be %in% c('condition', 'sample', 'copy')")
    }

    comment(glb_newobs_df) <- "glb_newobs_df"
    myprint_df(glb_newobs_df)
    str(glb_newobs_df)

    if (glb_split_entity_newent_datasets) {
        myprint_df(glb_trnobs_df)
        str(glb_trnobs_df)
    }
}
## [1] "Reading file ./data/NYTimesBlogTest.csv..."
## [1] "dimensions of data in ./data/NYTimesBlogTest.csv: 1,870 rows x 9 cols"
## NewsDesk SectionName SubsectionName
## 1 Culture
## 2 Culture Arts
## 3 Business Crosswords/Games
## 4 Business Business Day Dealbook
## 5 Science Health
## 6 Science Health
## Headline
## 1 'Birdman' Tops the Gothams
## 2 'Sleepy Hollow' Recap: A Not-So-Shocking Death
## 3 Drinking Buddy For Falstaff
## 4 Encouraging Public Service, Through Wall Street's 'Revolving Door'
## 5 Therapy Prevents Repeat Suicide Attempts
## 6 Hoping for a Good Death
## Snippet
## 1 The backstage tale won two awards; Citizenfour, the Edward Snowden documentary, was also a winner.
## 2 In the fall season finale, a question of where the series has many places to go.
## 3 In which Timothy Polin reveals his potty mouth.
## 4 The debate about pay for Wall Street executives who take government jobs appears to be based more on a populist shakedown than on good public policy.
## 5 Short-term psychotherapy may be an effective way to prevent repeated suicide attempts.
## 6 What I hadnt considered before my fathers heart attack was the precise meaning of not wanting to live hooked up to machines.
## Abstract
## 1 The backstage tale won two awards; Citizenfour, the Edward Snowden documentary, was also a winner.
## 2 In the fall season finale, a question of where the series has many places to go.
## 3 In which Timothy Polin reveals his potty mouth.
## 4 The debate about pay for Wall Street executives who take government jobs appears to be based more on a populist shakedown than on good public policy.
## 5 Short-term psychotherapy may be an effective way to prevent repeated suicide attempts.
## 6 What I hadnt considered before my fathers heart attack was the precise meaning of not wanting to live hooked up to machines.
## WordCount PubDate UniqueID
## 1 111 2014-12-01 22:45:24 6533
## 2 558 2014-12-01 22:01:34 6534
## 3 788 2014-12-01 22:00:26 6535
## 4 915 2014-12-01 21:04:13 6536
## 5 213 2014-12-01 19:13:20 6537
## 6 938 2014-12-01 19:05:12 6538
## NewsDesk SectionName SubsectionName
## 3 Business Crosswords/Games
## 334 OpEd Opinion
## 725 TStyle
## 732 Business Business Day Dealbook
## 752 Business Business Day Dealbook
## 864
## Headline
## 3 Drinking Buddy For Falstaff
## 334 Facts & Figures: America’s Unique Take on Maternity Leave
## 725 Ansel Elgort Buttons Up in Brioni
## 732 A Shake-Up as the Financial World Infiltrates Philanthropy
## 752 Coupang, a South Korean E-Commerce Site, Raises $300 Million
## 864 Today in Politics
## Snippet
## 3 In which Timothy Polin reveals his potty mouth.
## 334 In the U.S., paid parental leave is more of a perk than a guarantee.
## 725 The actor brought a tinge of youthfulness to the classic Italian houses retro-tailored look.
## 732 Donor-advised funds help investors get deductions for charitable donations in one year, but society doesnt get the benefit of the money right away.
## 752 The latest financing round underscores Coupangs maturity and its ambitions to one day be a publicly traded company.
## 864 The 113th Congress is concluding with partisan brinksmanship and one last mad scramble for votes to pass a $1.1 trillion spending package.
## Abstract
## 3 In which Timothy Polin reveals his potty mouth.
## 334 In the U.S., paid parental leave is more of a perk than a guarantee.
## 725 The actor brought a tinge of youthfulness to the classic Italian houses retro-tailored look.
## 732 Donor-advised funds help investors get deductions for charitable donations in one year, but society doesnt get the benefit of the money right away.
## 752 The latest financing round underscores Coupangs maturity and its ambitions to one day be a publicly traded company.
## 864 The 113th Congress is concluding with partisan brinksmanship and one last mad scramble for votes to pass a $1.1 trillion spending package.
## WordCount PubDate UniqueID
## 3 788 2014-12-01 22:00:26 6535
## 334 160 2014-12-04 11:45:20 6866
## 725 89 2014-12-10 12:30:47 7257
## 732 1172 2014-12-10 12:00:38 7264
## 752 353 2014-12-10 08:30:41 7284
## 864 1544 2014-12-11 07:09:25 7396
## NewsDesk SectionName SubsectionName
## 1865
## 1866 Business Technology
## 1867 Metro N.Y. / Region
## 1868 Multimedia
## 1869 Foreign World Asia Pacific
## 1870 Science Health
## Headline
## 1865 Today in Politics
## 1866 Uber Suspends Operations in Spain
## 1867 New York Today: The Year in News
## 1868 New Year, Old Memories, in Times Square
## 1869 Hong Kong Police Criticized After 14-Year-Old's Detention
## 1870 The Super-Short Workout and Other Fitness Trends
## Snippet
## 1865 House Republicans are ending the year on a defensive note over Representative Steve Scalises 2002 speech to a white supremacist group.
## 1866 In a first in the growing pushback against Ubers global expansion, a judges ruling barred telecommunications operators and banks from supporting the companys services.
## 1867 Wednesday: The most read stories of 2014, teeth-chattering cold, and its New Years Eve.
## 1868 What happens when you combine Burning Man, Independence Day fireworks, the last day of school and a full-contact Black Friday sale-a-bration? New Years Eve in Times Square.
## 1869 The authorities have been accused of trying to intimidate young pro-democracy protesters and their families after a 14-year-old girl was detained on suspicion of drawing flowers in chalk near government headquarters and sent to a juvenile home.
## 1870 The big story in exercise science this year was the super-short workout, although many other fitness-related themes emerged in 2014.
## Abstract
## 1865 House Republicans are ending the year on a defensive note over Representative Steve Scalises 2002 speech to a white supremacist group.
## 1866 In a first in the growing pushback against Ubers global expansion, a judges ruling barred telecommunications operators and banks from supporting the companys services.
## 1867 Wednesday: The most read stories of 2014, teeth-chattering cold, and its New Years Eve.
## 1868 What happens when you combine Burning Man, Independence Day fireworks, the last day of school and a full-contact Black Friday sale-a-bration? New Years Eve in Times Square.
## 1869 The authorities have been accused of trying to intimidate young pro-democracy protesters and their families after a 14-year-old girl was detained on suspicion of drawing flowers in chalk near government headquarters and sent to a juvenile home.
## 1870 The big story in exercise science this year was the super-short workout, although many other fitness-related themes emerged in 2014.
## WordCount PubDate UniqueID
## 1865 1616 2014-12-31 07:03:46 8397
## 1866 292 2014-12-31 06:09:32 8398
## 1867 1010 2014-12-31 06:06:58 8399
## 1868 387 2014-12-31 05:00:19 8400
## 1869 717 2014-12-31 04:16:29 8401
## 1870 818 2014-12-31 00:01:10 8402
## 'data.frame': 1870 obs. of 9 variables:
## $ NewsDesk : chr "Culture" "Culture" "Business" "Business" ...
## $ SectionName : chr "" "Arts" "Crosswords/Games" "Business Day" ...
## $ SubsectionName: chr "" "" "" "Dealbook" ...
## $ Headline : chr "'Birdman' Tops the Gothams" "'Sleepy Hollow' Recap: A Not-So-Shocking Death" "Drinking Buddy For Falstaff" "Encouraging Public Service, Through Wall Street's 'Revolving Door'" ...
## $ Snippet : chr "The backstage tale won two awards; Citizenfour, the Edward Snowden documentary, was also a winner." "In the fall season finale, a question of where the series has many places to go." "In which Timothy Polin reveals his potty mouth." "The debate about pay for Wall Street executives who take government jobs appears to be based more on a populist shakedown than "| __truncated__ ...
## $ Abstract : chr "The backstage tale won two awards; Citizenfour, the Edward Snowden documentary, was also a winner." "In the fall season finale, a question of where the series has many places to go." "In which Timothy Polin reveals his potty mouth." "The debate about pay for Wall Street executives who take government jobs appears to be based more on a populist shakedown than "| __truncated__ ...
## $ WordCount : int 111 558 788 915 213 938 1336 2644 752 99 ...
## $ PubDate : chr "2014-12-01 22:45:24" "2014-12-01 22:01:34" "2014-12-01 22:00:26" "2014-12-01 21:04:13" ...
## $ UniqueID : int 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 ...
## - attr(*, "comment")= chr "glb_newobs_df"
## NULL
# Sanity checks on the imported data, optional column dropping, identifier
# validation, and recombination of training + new data into glb_allobs_df.
if (nrow(glb_trnobs_df) == nrow(glb_allobs_df)) {
    warning("glb_trnobs_df same as glb_allobs_df")
}
if (nrow(glb_newobs_df) == nrow(glb_allobs_df)) {
    warning("glb_newobs_df same as glb_allobs_df")
}

if (length(glb_drop_vars) > 0) {
    warning("dropping vars: ", paste0(glb_drop_vars, collapse=", "))
    # drop=FALSE keeps a data.frame even when a single column survives
    glb_allobs_df <- glb_allobs_df[, setdiff(names(glb_allobs_df), glb_drop_vars),
                                   drop=FALSE]
    glb_trnobs_df <- glb_trnobs_df[, setdiff(names(glb_trnobs_df), glb_drop_vars),
                                   drop=FALSE]
    glb_newobs_df <- glb_newobs_df[, setdiff(names(glb_newobs_df), glb_drop_vars),
                                   drop=FALSE]
}

# Check for duplicates in glb_id_vars
if (length(glb_id_vars) == 0) {
    warning("using .rownames as identifiers for observations")
    glb_allobs_df$.rownames <- rownames(glb_allobs_df)
    glb_id_vars <- ".rownames"
}
if (any(duplicated(glb_allobs_df[, glb_id_vars, FALSE]))) {
    # collapse so several id columns are listed readably in the message
    stop(paste0(glb_id_vars, collapse=", "), " duplicated in glb_allobs_df")
}
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features, glb_id_vars)

# Combine trnent & newent into glb_allobs_df for easier manipulation
glb_trnobs_df$.src <- "Train"
glb_newobs_df$.src <- "Test"
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features, ".src")
glb_allobs_df <- myrbind_df(glb_trnobs_df, glb_newobs_df)
comment(glb_allobs_df) <- "glb_allobs_df"
# The rebuild above discards a .rownames id column that was generated on the
# previous glb_allobs_df (the train / new frames never carried it), so
# re-create it when it is the identifier and is now missing.
# NOTE(review): assumes myrbind_df() yields unique row names - confirm.
if (identical(glb_id_vars, ".rownames") &&
    !(".rownames" %in% names(glb_allobs_df))) {
    glb_allobs_df$.rownames <- rownames(glb_allobs_df)
}
glb_trnobs_df <- glb_newobs_df <- NULL
glb_chunks_df <- myadd_chunk(glb_chunks_df, "inspect.data", major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 1 import.data 1 0 13.224 14.257 1.033
## 2 inspect.data 2 0 14.258 NA NA
2.0: inspect data
#print(str(glb_allobs_df))
#View(glb_allobs_df)
# Print the cross-tabulation of `var` by data source (.src) twice:
#   first as returned by mycreate_xtab_df, then with every row divided by its
#   row total (NAs ignored in the total).
#
# Args:
#   var: name of the column in glb_allobs_df to tabulate against .src
# Returns (invisibly, via print): the row-normalised table
dsp_class_dstrb <- function(var) {
    counts_df <- mycreate_xtab_df(glb_allobs_df, c(".src", var))
    # Use the source label as row name and drop it from the data columns
    rownames(counts_df) <- counts_df[[".src"]]
    counts_df <- counts_df[, names(counts_df) != ".src", drop=FALSE]
    print(counts_df)
    row_totals <- rowSums(counts_df, na.rm=TRUE)
    print(counts_df / row_totals)
}
# Print per-column counts of problematic values in a data frame.
#
# Columns are split into character columns (as reported by myfind_chr_cols_df)
# and all remaining columns. For the remaining columns the number of NAs,
# zeros, Infs and NaNs is printed; for the character columns (except .src)
# the number of empty strings is printed.
#
# Args:
#   df: data frame to inspect; its comment() attribute, if set, is used as the
#       display name in the section headings
# Returns (invisibly, via print): the empty-string counts of the last section
dsp_problem_data <- function(df) {
    df_name <- comment(df)
    df_name <- if (is.null(df_name)) "" else df_name[1]
    chr_cols <- myfind_chr_cols_df(df)
    oth_cols <- setdiff(names(df), chr_cols)

    # Apply count_fn to every listed column; result is named by column
    count_cols <- function(cols, count_fn)
        sapply(cols, function(col) count_fn(df[, col]))

    print(sprintf("numeric data missing in %s: ", df_name))
    print(count_cols(oth_cols, function(x) sum(is.na(x))))

    print(sprintf("numeric data w/ 0s in %s: ", df_name))
    print(count_cols(oth_cols, function(x) sum(x == 0, na.rm=TRUE)))

    print(sprintf("numeric data w/ Infs in %s: ", df_name))
    print(count_cols(oth_cols, function(x) sum(x == Inf, na.rm=TRUE)))

    print(sprintf("numeric data w/ NaNs in %s: ", df_name))
    # `x == NaN` is always NA, so NaNs have to be detected with is.nan();
    #   non-atomic columns cannot hold NaN and are reported as 0
    print(count_cols(oth_cols,
                     function(x) if (is.atomic(x)) sum(is.nan(x)) else 0L))

    print(sprintf("string data missing in %s: ", df_name))
    # na.rm=TRUE so that one NA string does not turn the whole count into NA
    print(count_cols(setdiff(chr_cols, ".src"),
                     function(x) sum(x == "", na.rm=TRUE)))
}
# Performed repeatedly in other chunks
# Standard data check: histogram of the raw response per data source,
#   class distribution (classification only) and problem-value counts.
#   Reads glb_allobs_df and the glb_* control variables from the global env.
glb_chk_data <- function() {
    # Histogram of glb_rsp_var_raw, one panel per .src value
    rsp_hist <- myplot_histogram(glb_allobs_df, glb_rsp_var_raw) +
        facet_wrap(~ .src)
    print(rsp_hist)

    if (glb_is_classification) {
        # Use the mapped response once it exists in glb_allobs_df,
        #   otherwise fall back to the raw response
        rsp_col <- ifelse(glb_rsp_var %in% names(glb_allobs_df),
                          glb_rsp_var, glb_rsp_var_raw)
        dsp_class_dstrb(var=rsp_col)
    }

    dsp_problem_data(glb_allobs_df)
}
glb_chk_data()
## Loading required package: reshape2
## Popular.0 Popular.1 Popular.NA
## Test NA NA 1870
## Train 5439 1093 NA
## Popular.0 Popular.1 Popular.NA
## Test NA NA 1
## Train 0.8326699 0.1673301 NA
## [1] "numeric data missing in glb_allobs_df: "
## WordCount Popular UniqueID
## 0 1870 0
## [1] "numeric data w/ 0s in glb_allobs_df: "
## WordCount Popular UniqueID
## 109 5439 0
## [1] "numeric data w/ Infs in glb_allobs_df: "
## WordCount Popular UniqueID
## 0 0 0
## [1] "numeric data w/ NaNs in glb_allobs_df: "
## WordCount Popular UniqueID
## 0 0 0
## [1] "string data missing in glb_allobs_df: "
## NewsDesk SectionName SubsectionName Headline Snippet
## 2408 2899 6176 0 13
## Abstract PubDate
## 17 0
# Create new features that help diagnostics
# When a mapping function is configured, derive glb_rsp_var from the raw
#   response, verify the mapping and show the resulting class distribution
if (!is.null(glb_map_rsp_raw_to_var)) {
    glb_allobs_df[[glb_rsp_var]] <-
        glb_map_rsp_raw_to_var(glb_allobs_df[[glb_rsp_var_raw]])
    mycheck_map_results(mapd_df=glb_allobs_df,
                        from_col_name=glb_rsp_var_raw,
                        to_col_name=glb_rsp_var)
    if (glb_is_classification) {
        dsp_class_dstrb(glb_rsp_var)
    }
}
## Loading required package: sqldf
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI
## Loading required package: tcltk
## Popular Popular.fctr .n
## 1 0 N 5439
## 2 NA <NA> 1870
## 3 1 Y 1093
## Warning in loop_apply(n, do.ply): Removed 1 rows containing missing values
## (position_stack).
## Popular.fctr.N Popular.fctr.Y Popular.fctr.NA
## Test NA NA 1870
## Train 5439 1093 NA
## Popular.fctr.N Popular.fctr.Y Popular.fctr.NA
## Test NA NA 1
## Train 0.8326699 0.1673301 NA
# Convert dates to numbers
# typically, dates come in as chars;
# so this must be done before converting chars to factors
# Derive calendar features from one or more timestamp columns.
#
# Args:
#   df:      data frame holding the timestamp columns and the response column
#   vars:    names of the character columns in df formatted as
#            "%Y-%m-%d %H:%M:%S"
#   rsp_var: name of the response column in df; used as the x-axis / grouping
#            variable of the diagnostic plots
# Returns:
#   data frame with one row per row of df and, for every entry of vars, the
#   columns <var>.POSIX, <var>.year.fctr, <var>.month.fctr, <var>.date.fctr,
#   <var>.wkday.fctr, <var>.wkend, <var>.hour.fctr, <var>.minute.fctr and
#   <var>.second.fctr
# Side effects:
#   prints a box plot and a bar plot of the seconds feature for every var
myextract_dates_df <- function(df, vars, rsp_var) {
    keep_feats <- c(NULL)
    feats_df <- NULL    # accumulates the kept features of all vars
    for (var in unique(vars)) {
        dates_df <- data.frame(.date=strptime(df[, var], "%Y-%m-%d %H:%M:%S"))
        dates_df[, rsp_var] <- df[, rsp_var]
        dates_df[, paste0(var, ".POSIX")] <- dates_df$.date
        dates_df[, paste0(var, ".year")] <- as.numeric(format(dates_df$.date, "%Y"))
        dates_df[, paste0(var, ".year.fctr")] <- as.factor(format(dates_df$.date, "%Y"))
        dates_df[, paste0(var, ".month")] <- as.numeric(format(dates_df$.date, "%m"))
        dates_df[, paste0(var, ".month.fctr")] <- as.factor(format(dates_df$.date, "%m"))
        dates_df[, paste0(var, ".date")] <- as.numeric(format(dates_df$.date, "%d"))
        dates_df[, paste0(var, ".date.fctr")] <-
            cut(as.numeric(format(dates_df$.date, "%d")), 5) # by month week
        # wkday Sun=0; Mon=1; ...; Sat=6
        dates_df[, paste0(var, ".wkday")] <- as.numeric(format(dates_df$.date, "%w"))
        dates_df[, paste0(var, ".wkday.fctr")] <- as.factor(format(dates_df$.date, "%w"))

        # Federal holidays 1.9., 13.10., 27.11., 25.12.
        # NYState holidays 1.9., 13.10., 11.11., 27.11., 25.12.
        # Only the federal list is flagged below
        months <- dates_df[, paste0(var, ".month")]
        dates <- dates_df[, paste0(var, ".date")]
        dates_df[, paste0(var, ".hlday")] <-
            ifelse( ((months == 09) & (dates == 01)) |
                    ((months == 10) & (dates == 13)) |
                    ((months == 11) & (dates == 27)) |
                    ((months == 12) & (dates == 25)) ,
                    1, 0)
        # .wkend is 1 for Saturdays, Sundays and the holidays flagged above
        dates_df[, paste0(var, ".wkend")] <- as.numeric(
            (dates_df[, paste0(var, ".wkday")] %in% c(0, 6)) |
            dates_df[, paste0(var, ".hlday")] )

        dates_df[, paste0(var, ".hour")] <- as.numeric(format(dates_df$.date, "%H"))
        dates_df[, paste0(var, ".hour.fctr")] <-
            cut(as.numeric(format(dates_df$.date, "%H")), 3) # by work-shift
        dates_df[, paste0(var, ".minute")] <- as.numeric(format(dates_df$.date, "%M"))
        dates_df[, paste0(var, ".minute.fctr")] <-
            cut(as.numeric(format(dates_df$.date, "%M")), 4) # by quarter-hours
        dates_df[, paste0(var, ".second")] <- as.numeric(format(dates_df$.date, "%S"))
        dates_df[, paste0(var, ".second.fctr")] <-
            cut(as.numeric(format(dates_df$.date, "%S")), 4) # by quarter-minutes

        # Plot the features of the current var (names must follow `var`,
        #   not a fixed "PubDate" prefix)
        print(gp <- myplot_box(df=dates_df, ycol_names=paste0(var, ".second"),
                               xcol_name=rsp_var))
        print(gp <- myplot_bar(df=dates_df, ycol_names=paste0(var, ".second.fctr"),
                               xcol_name=rsp_var,
                               colorcol_name=paste0(var, ".second.fctr")))

        var_feats <- paste0(var,
            c(".POSIX", ".year.fctr", ".month.fctr", ".date.fctr", ".wkday.fctr",
              ".wkend", ".hour.fctr", ".minute.fctr", ".second.fctr"))
        keep_feats <- union(keep_feats, var_feats)
        # dates_df is rebuilt for every var, so the kept columns have to be
        #   collected here; otherwise only the last var's features survive
        feats_df <- if (is.null(feats_df)) {
            dates_df[, var_feats, drop=FALSE]
        } else {
            cbind(feats_df, dates_df[, var_feats, drop=FALSE])
        }
    }
    #myprint_df(dates_df)
    return(feats_df)
}
# Append the calendar features of every configured date column and keep the
#   raw date columns and their POSIX versions out of the feature set
if (!is.null(glb_date_vars)) {
    glb_allobs_df <- cbind(glb_allobs_df,
        myextract_dates_df(df=glb_allobs_df, vars=glb_date_vars, rsp_var=glb_rsp_var))
    # Build both names for EVERY date var; pasting glb_date_vars against
    #   c("", ".POSIX") recycles element-wise and mismatches the suffixes as
    #   soon as there is more than one date var
    glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features,
        c(glb_date_vars, paste0(glb_date_vars, ".POSIX")))
}
## Warning in mean.default(X[[1L]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[2L]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[1L]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[2L]], ...): argument is not numeric or logical:
## returning NA
# Order the observations by publication time and plot the response for the
#   observations published before 2014-09-02
srt_entity_df <- orderBy(~PubDate.POSIX, glb_allobs_df)
print(myplot_scatter(
    srt_entity_df[which(srt_entity_df$PubDate.POSIX <
                        strptime("2014-09-02", "%Y-%m-%d")), ],
    xcol_name="PubDate.POSIX",
    ycol_name=glb_rsp_var,
    colorcol_name=glb_rsp_var))
# Create features that measure the gap between previous timestamp in the data
require(zoo)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Numeric timestamps (seconds since the epoch) as a zoo series, stored
#   alongside the sorted observations
pd <- as.POSIXlt(srt_entity_df[["PubDate"]])
z <- zoo(as.numeric(pd))
srt_entity_df[, "PubDate.zoo"] <- z
print(head(srt_entity_df))
## NewsDesk SectionName SubsectionName
## 33 Science Health
## 32 Foreign World Asia Pacific
## 31 Multimedia
## 30 Culture Arts
## 29 Business Business Day Dealbook
## 28 Magazine Magazine
## Headline
## 33 Don't Catch What Ails Your House
## 32 Ukraine Conflict Has Been a Lift for China, Scholars Say
## 31 Revisiting Life and Death in Africa
## 30 Fabio Luisi Has a New Gig
## 29 Heineken to Sell Mexican Packaging Unit to Crown Holdings
## 28 Behind the Cover Story: Emily Bazelon on Medical Abortion Through the Mail
## Snippet
## 33 It doesnt take a flood to encourage the growth of mold in a home. A moist environment will do. A runny nose, coughing and all the rest typically follow.
## 32 As the United States and the European Union have imposed sanctions on Russia over the unrest in eastern Ukraine, China has been able to stand apart and gain concrete advantages, experts on foreign policy say.
## 31 Yunghi Kim went to Somalia 20 years ago expecting to cover a famine. She found herself instead in a war zone.
## 30 The music director of the Zurich Opera and principal conductor of the Metropolitan Opera will be named principal conductor of the Danish National Symphony Orchestra.
## 29 The deal values the container unit Empaque at about $1.2 billion and would make Crown Holdings the second-largest beverage can producer in North America.
## 28 Emily Bazelon, a contributing writer for the magazine, wrote this weeks cover story about the online distribution of medical abortions. Here she discusses reporting on a group of activists working to provide medical abortions through the mail.
## Abstract
## 33 It doesnt take a flood to encourage the growth of mold in a home. A moist environment will do. A runny nose, coughing and all the rest typically follow.
## 32 As the United States and the European Union have imposed sanctions on Russia over the unrest in eastern Ukraine, China has been able to stand apart and gain concrete advantages, experts on foreign policy say.
## 31 Yunghi Kim went to Somalia 20 years ago expecting to cover a famine. She found herself instead in a war zone.
## 30 The music director of the Zurich Opera and principal conductor of the Metropolitan Opera will be named principal conductor of the Danish National Symphony Orchestra.
## 29 The deal values the container unit Empaque at about $1.2 billion and would make Crown Holdings the second-largest beverage can producer in North America.
## 28 Emily Bazelon, a contributing writer for the magazine, wrote this weeks cover story about the online distribution of medical abortions. Here she discusses reporting on a group of activists working to provide medical abortions through the mail.
## WordCount PubDate Popular UniqueID .src Popular.fctr
## 33 962 2014-09-01 00:01:32 1 33 Train Y
## 32 529 2014-09-01 02:48:41 0 32 Train N
## 31 832 2014-09-01 03:00:15 0 31 Train N
## 30 166 2014-09-01 04:00:06 0 30 Train N
## 29 442 2014-09-01 04:11:20 0 29 Train N
## 28 1190 2014-09-01 05:00:26 0 28 Train N
## PubDate.POSIX PubDate.year.fctr PubDate.month.fctr
## 33 2014-09-01 00:01:32 2014 09
## 32 2014-09-01 02:48:41 2014 09
## 31 2014-09-01 03:00:15 2014 09
## 30 2014-09-01 04:00:06 2014 09
## 29 2014-09-01 04:11:20 2014 09
## 28 2014-09-01 05:00:26 2014 09
## PubDate.date.fctr PubDate.wkday.fctr PubDate.wkend PubDate.hour.fctr
## 33 (0.97,7] 1 1 (-0.023,7.67]
## 32 (0.97,7] 1 1 (-0.023,7.67]
## 31 (0.97,7] 1 1 (-0.023,7.67]
## 30 (0.97,7] 1 1 (-0.023,7.67]
## 29 (0.97,7] 1 1 (-0.023,7.67]
## 28 (0.97,7] 1 1 (-0.023,7.67]
## PubDate.minute.fctr PubDate.second.fctr PubDate.zoo
## 33 (-0.059,14.8] (29.5,44.2] 1409544092
## 32 (44.2,59.1] (29.5,44.2] 1409554121
## 31 (-0.059,14.8] (14.8,29.5] 1409554815
## 30 (-0.059,14.8] (-0.059,14.8] 1409558406
## 29 (-0.059,14.8] (14.8,29.5] 1409559080
## 28 (-0.059,14.8] (14.8,29.5] 1409562026
# Same scatter plot as for PubDate.POSIX, now against the numeric timestamp
print(myplot_scatter(
    srt_entity_df[which(srt_entity_df$PubDate.POSIX <
                        strptime("2014-09-02", "%Y-%m-%d")), ],
    xcol_name="PubDate.zoo",
    ycol_name=glb_rsp_var,
    colorcol_name=glb_rsp_var))
## Don't know how to automatically pick scale for object of type zoo. Defaulting to continuous
# Gap (in seconds) between each observation and the one published 1, 10 and
#   100 positions earlier in the sorted data. Merging with the empty series b
#   pads the leading positions that have no predecessor with NA, which is
#   then replaced by 0. A log(1 + gap) version of every feature is added too.
n <- nrow(srt_entity_df)
b <- zoo(, seq(n))

# Gap to the previous observation
last1 <- as.numeric(merge(z-lag(z, -1), b, all = TRUE))
srt_entity_df$PubDate.last1 <- last1
srt_entity_df$PubDate.last1[is.na(srt_entity_df$PubDate.last1)] <- 0
srt_entity_df$PubDate.last1.log <- log(1 + srt_entity_df$PubDate.last1)
gp <- myplot_box(
    df=srt_entity_df[which(srt_entity_df$PubDate.last1.log > 0), ],
    ycol_names="PubDate.last1.log",
    xcol_name=glb_rsp_var)
print(gp)

# Gap to the observation 10 positions back
last10 <- as.numeric(merge(z-lag(z, -10), b, all = TRUE))
srt_entity_df$PubDate.last10 <- last10
srt_entity_df$PubDate.last10[is.na(srt_entity_df$PubDate.last10)] <- 0
srt_entity_df$PubDate.last10.log <- log(1 + srt_entity_df$PubDate.last10)
gp <- myplot_box(
    df=srt_entity_df[which(srt_entity_df$PubDate.last10.log > 0), ],
    ycol_names="PubDate.last10.log",
    xcol_name=glb_rsp_var)
print(gp)

# Gap to the observation 100 positions back
last100 <- as.numeric(merge(z-lag(z, -100), b, all = TRUE))
srt_entity_df$PubDate.last100 <- last100
srt_entity_df$PubDate.last100[is.na(srt_entity_df$PubDate.last100)] <- 0
srt_entity_df$PubDate.last100.log <- log(1 + srt_entity_df$PubDate.last100)
gp <- myplot_box(
    df=srt_entity_df[which(srt_entity_df$PubDate.last100.log > 0), ],
    ycol_names="PubDate.last100.log",
    xcol_name=glb_rsp_var)
print(gp)

# Continue with the sorted, extended data; only the .log versions stay
#   eligible as features
glb_allobs_df <- srt_entity_df
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features,
    c("PubDate.zoo", "PubDate.last1", "PubDate.last10", "PubDate.last100"))
# all2$last3 = as.numeric(merge(z-lag(z, -3), b, all = TRUE))
# all2$last5 = as.numeric(merge(z-lag(z, -5), b, all = TRUE))
# all2$last10 = as.numeric(merge(z-lag(z, -10), b, all = TRUE))
# all2$last20 = as.numeric(merge(z-lag(z, -20), b, all = TRUE))
# all2$last50 = as.numeric(merge(z-lag(z, -50), b, all = TRUE))
#
#
# # order table
# all2 = all2[order(all2$id),]
#
# ## fill in NAs
# # count averages
# na.avg = all2 %>% group_by(weekend, hour) %>% dplyr::summarise(
# last1=mean(last1, na.rm=TRUE),
# last3=mean(last3, na.rm=TRUE),
# last5=mean(last5, na.rm=TRUE),
# last10=mean(last10, na.rm=TRUE),
# last20=mean(last20, na.rm=TRUE),
# last50=mean(last50, na.rm=TRUE)
# )
#
# # fill in averages
# na.merge = merge(all2, na.avg, by=c("weekend","hour"))
# na.merge = na.merge[order(na.merge$id),]
# for(i in c("last1", "last3", "last5", "last10", "last20", "last50")) {
# y = paste0(i, ".y")
# idx = is.na(all2[[i]])
# all2[idx,][[i]] <- na.merge[idx,][[y]]
# }
# rm(na.avg, na.merge, b, i, idx, n, pd, sec, sh, y, z)
# check distribution of all numeric data
# Print a box plot of every listed column of glb_allobs_df against the
#   response glb_rsp_var; factor columns are additionally facetted by their
#   own levels.
#
# Args:
#   vars_lst: names of the columns to plot
dsp_numeric_vars_dstrb <- function(vars_lst) {
    for (col_name in vars_lst) {
        print(paste0("var: ", col_name))
        box_plot <- myplot_box(df=glb_allobs_df, ycol_names=col_name,
                               xcol_name=glb_rsp_var)
        if (is.factor(glb_allobs_df[[col_name]])) {
            box_plot <- box_plot + facet_wrap(reformulate(col_name))
        }
        print(box_plot)
    }
}
# dsp_numeric_vars_dstrb(setdiff(names(glb_allobs_df),
# union(myfind_chr_cols_df(glb_allobs_df),
# c(glb_rsp_var_raw, glb_rsp_var))))
# Add diagnostic features to a data frame of observations.
#
# Args:
#   obs_df: data frame with a WordCount column
#   ref_df: not used by the current body (defaults to glb_allobs_df)
# Returns:
#   obs_df with two columns appended:
#     WordCount.log = log(1 + WordCount)
#     .rnorm        = standard-normal random noise, one draw per row
add_new_diag_feats <- function(obs_df, ref_df=glb_allobs_df) {
    # require() attaches plyr to the search path; retained so that this side
    #   effect of calling the function is unchanged
    require(plyr)

    # Templates for further features:
    #   <col_name>.NA=is.na(<col_name>),
    #   <col_name>.fctr=factor(<col_name>,
    #               as.factor(union(obs_df$<col_name>, obs_twin_df$<col_name>))),
    #   <col_name>.fctr=relevel(factor(<col_name>,
    #               as.factor(union(obs_df$<col_name>, obs_twin_df$<col_name>))),
    #                           "<ref_val>"),
    #   <col2_name>.fctr=relevel(factor(ifelse(<col1_name> == <val>, "<oth_val>", "<ref_val>")),
    #                         as.factor(c("R", "<ref_val>")),
    #                         ref="<ref_val>"),
    #   This doesn't work - use sapply instead
    #   <col_name>.fctr_num=grep(<col_name>, levels(<col_name>.fctr)),
    #
    #   Date.my=as.Date(strptime(Date, "%m/%d/%y %H:%M")),
    #   Year=year(Date.my),
    #   Month=months(Date.my),
    #   Weekday=weekdays(Date.my)
    #   <col_name>.log=log(1 + <col.name>),
    #   <col_name>=<table>[as.character(<col2_name>)],
    #   <col_name>=as.numeric(<col2_name>),

    obs_df[["WordCount.log"]] <- log(1 + obs_df[["WordCount"]])
    obs_df[[".rnorm"]] <- rnorm(n=nrow(obs_df))

    # If levels of a factor are different across obs_df & glb_newobs_df; predict.glm fails
    # Transformations that need row-wise handling:
    #   obs_df$<col_name>.fctr.num <- sapply(1:nrow(obs_df),
    #       function(row_ix) grep(obs_df[row_ix, "<col_name>"],
    #                             levels(obs_df[row_ix, "<col_name>.fctr"])))
    #print(summary(obs_df))
    #print(sapply(names(obs_df), function(col) sum(is.na(obs_df[, col]))))
    return(obs_df)
}
# Add WordCount.log since WordCount is not distributed normally
glb_allobs_df <- add_new_diag_feats(glb_allobs_df)
## Loading required package: plyr
print("Replacing WordCount with WordCount.log in potential feature set")
## [1] "Replacing WordCount with WordCount.log in potential feature set"
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features, "WordCount")
# Remove the year feature since all entity data is from 2014; the column
#   created by myextract_dates_df is PubDate.year.fctr, so that name has to be
#   listed for the exclusion to take effect
# Remove PubDate.month.fctr since all newent data is from December
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features,
    c("PubDate.year", "PubDate.year.fctr", "PubDate.month.fctr"))
# Check distributions of newly transformed / extracted vars
# Enhancement: remove vars that were displayed earlier
dsp_numeric_vars_dstrb(setdiff(names(glb_allobs_df),
    union(myfind_chr_cols_df(glb_allobs_df),
        union(glb_rsp_var_raw,
            union(glb_rsp_var, glb_exclude_vars_as_features)))))
## [1] "var: PubDate.year.fctr"
## [1] "var: PubDate.date.fctr"
## [1] "var: PubDate.wkday.fctr"
## [1] "var: PubDate.wkend"
## [1] "var: PubDate.hour.fctr"
## [1] "var: PubDate.minute.fctr"
## [1] "var: PubDate.second.fctr"
## [1] "var: PubDate.last1.log"
## [1] "var: PubDate.last10.log"
## [1] "var: PubDate.last100.log"
## [1] "var: WordCount.log"
## [1] "var: .rnorm"
# Convert factors to dummy variables
# Build splines require(splines); bsBasis <- bs(training$age, df=3)
#pairs(subset(glb_trnobs_df, select=-c(col_symbol)))
# Check for glb_newobs_df & glb_trnobs_df features range mismatches
# Other diagnostics:
# print(subset(glb_trnobs_df, <col1_name> == max(glb_trnobs_df$<col1_name>, na.rm=TRUE) &
# <col2_name> <= mean(glb_trnobs_df$<col1_name>, na.rm=TRUE)))
# print(glb_trnobs_df[which.max(glb_trnobs_df$<col_name>),])
# print(<col_name>_freq_glb_trnobs_df <- mycreate_tbl_df(glb_trnobs_df, "<col_name>"))
# print(which.min(table(glb_trnobs_df$<col_name>)))
# print(which.max(table(glb_trnobs_df$<col_name>)))
# print(which.max(table(glb_trnobs_df$<col1_name>, glb_trnobs_df$<col2_name>)[, 2]))
# print(table(glb_trnobs_df$<col1_name>, glb_trnobs_df$<col2_name>))
# print(table(is.na(glb_trnobs_df$<col1_name>), glb_trnobs_df$<col2_name>))
# print(table(sign(glb_trnobs_df$<col1_name>), glb_trnobs_df$<col2_name>))
# print(mycreate_xtab_df(glb_trnobs_df, <col1_name>))
# print(mycreate_xtab_df(glb_trnobs_df, c(<col1_name>, <col2_name>)))
# print(<col1_name>_<col2_name>_xtab_glb_trnobs_df <-
# mycreate_xtab_df(glb_trnobs_df, c("<col1_name>", "<col2_name>")))
# <col1_name>_<col2_name>_xtab_glb_trnobs_df[is.na(<col1_name>_<col2_name>_xtab_glb_trnobs_df)] <- 0
# print(<col1_name>_<col2_name>_xtab_glb_trnobs_df <-
# mutate(<col1_name>_<col2_name>_xtab_glb_trnobs_df,
# <col3_name>=(<col1_name> * 1.0) / (<col1_name> + <col2_name>)))
# print(<col2_name>_min_entity_arr <-
# sort(tapply(glb_trnobs_df$<col1_name>, glb_trnobs_df$<col2_name>, min, na.rm=TRUE)))
# print(<col1_name>_na_by_<col2_name>_arr <-
# sort(tapply(glb_trnobs_df$<col1_name>.NA, glb_trnobs_df$<col2_name>, mean, na.rm=TRUE)))
# Other plots:
# print(myplot_box(df=glb_trnobs_df, ycol_names="<col1_name>"))
# print(myplot_box(df=glb_trnobs_df, ycol_names="<col1_name>", xcol_name="<col2_name>"))
# print(myplot_line(subset(glb_trnobs_df, Symbol %in% c("KO", "PG")),
# "Date.my", "StockPrice", facet_row_colnames="Symbol") +
# geom_vline(xintercept=as.numeric(as.Date("2003-03-01"))) +
# geom_vline(xintercept=as.numeric(as.Date("1983-01-01")))
# )
# print(myplot_scatter(glb_allobs_df, "<col1_name>", "<col2_name>", smooth=TRUE))
# print(myplot_scatter(glb_allobs_df, "<col1_name>", "<col2_name>", colorcol_name="<Pred.fctr>") +
# geom_point(data=subset(glb_allobs_df, <condition>),
# mapping=aes(x=<x_var>, y=<y_var>), color="red", shape=4, size=5))
# Discard the temporaries of the inspect.data chunk and start the next chunk
rm(list=c("srt_entity_df", "last1", "last10", "last100", "pd"))

glb_chunks_df <- myadd_chunk(glb_chunks_df, "cleanse.data", major.inc=FALSE)
## label step_major step_minor bgn end elapsed
## 2 inspect.data 2 0 14.258 33.04 18.782
## 3 cleanse.data 2 1 33.040 NA NA
2.1: cleanse data
# Options:
# 1. Not fill missing vars
# 2. Fill missing numerics with a different algorithm
# 3. Fill missing chars with data based on clusters
# Report the NA / zero / Inf / NaN / empty-string counts per column before
#   any cleansing is applied
dsp_problem_data(glb_allobs_df)
## [1] "numeric data missing in : "
## WordCount Popular UniqueID
## 0 1870 0
## Popular.fctr PubDate.POSIX PubDate.year.fctr
## 1870 0 0
## PubDate.month.fctr PubDate.date.fctr PubDate.wkday.fctr
## 0 0 0
## PubDate.wkend PubDate.hour.fctr PubDate.minute.fctr
## 0 0 0
## PubDate.second.fctr PubDate.zoo PubDate.last1
## 0 0 0
## PubDate.last1.log PubDate.last10 PubDate.last10.log
## 0 0 0
## PubDate.last100 PubDate.last100.log WordCount.log
## 0 0 0
## .rnorm
## 0
## [1] "numeric data w/ 0s in : "
## WordCount Popular UniqueID
## 109 5439 0
## Popular.fctr PubDate.POSIX PubDate.year.fctr
## 0 0 0
## PubDate.month.fctr PubDate.date.fctr PubDate.wkday.fctr
## 0 0 378
## PubDate.wkend PubDate.hour.fctr PubDate.minute.fctr
## 7624 0 0
## PubDate.second.fctr PubDate.zoo PubDate.last1
## 0 0 11
## PubDate.last1.log PubDate.last10 PubDate.last10.log
## 11 10 10
## PubDate.last100 PubDate.last100.log WordCount.log
## 100 100 109
## .rnorm
## 0
## [1] "numeric data w/ Infs in : "
## WordCount Popular UniqueID
## 0 0 0
## Popular.fctr PubDate.POSIX PubDate.year.fctr
## 0 0 0
## PubDate.month.fctr PubDate.date.fctr PubDate.wkday.fctr
## 0 0 0
## PubDate.wkend PubDate.hour.fctr PubDate.minute.fctr
## 0 0 0
## PubDate.second.fctr PubDate.zoo PubDate.last1
## 0 0 0
## PubDate.last1.log PubDate.last10 PubDate.last10.log
## 0 0 0
## PubDate.last100 PubDate.last100.log WordCount.log
## 0 0 0
## .rnorm
## 0
## [1] "numeric data w/ NaNs in : "
## WordCount Popular UniqueID
## 0 0 0
## Popular.fctr PubDate.POSIX PubDate.year.fctr
## 0 0 0
## PubDate.month.fctr PubDate.date.fctr PubDate.wkday.fctr
## 0 0 0
## PubDate.wkend PubDate.hour.fctr PubDate.minute.fctr
## 0 0 0
## PubDate.second.fctr PubDate.zoo PubDate.last1
## 0 0 0
## PubDate.last1.log PubDate.last10 PubDate.last10.log
## 0 0 0
## PubDate.last100 PubDate.last100.log WordCount.log
## 0 0 0
## .rnorm
## 0
## [1] "string data missing in : "
## NewsDesk SectionName SubsectionName Headline Snippet
## 2408 2899 6176 0 13
## Abstract PubDate
## 17 0
# Treat WordCount.log == 0 as missing; the number of affected observations is
#   reported through a warning
warning("Forcing ", sum(glb_allobs_df$WordCount.log == 0, na.rm=TRUE),
        " obs with WordCount.log 0s to NA")
## Warning: Forcing 109 obs with WordCount.log 0s to NA
# which() drops NA comparisons: a logical row index containing NA is rejected
#   in data-frame assignment, which would make this step fail once
#   WordCount.log already holds NAs (e.g. when the chunk is run again)
glb_allobs_df[which(glb_allobs_df$WordCount.log == 0), "WordCount.log"] <- NA
dsp_problem_data(glb_allobs_df)
## [1] "numeric data missing in : "
## WordCount Popular UniqueID
## 0 1870 0
## Popular.fctr PubDate.POSIX PubDate.year.fctr
## 1870 0 0
## PubDate.month.fctr PubDate.date.fctr PubDate.wkday.fctr
## 0 0 0
## PubDate.wkend PubDate.hour.fctr PubDate.minute.fctr
## 0 0 0
## PubDate.second.fctr PubDate.zoo PubDate.last1
## 0 0 0
## PubDate.last1.log PubDate.last10 PubDate.last10.log
## 0 0 0
## PubDate.last100 PubDate.last100.log WordCount.log
## 0 0 109
## .rnorm
## 0
## [1] "numeric data w/ 0s in : "
## WordCount Popular UniqueID
## 109 5439 0
## Popular.fctr PubDate.POSIX PubDate.year.fctr
## 0 0 0
## PubDate.month.fctr PubDate.date.fctr PubDate.wkday.fctr
## 0 0 378
## PubDate.wkend PubDate.hour.fctr PubDate.minute.fctr
## 7624 0 0
## PubDate.second.fctr PubDate.zoo PubDate.last1
## 0 0 11
## PubDate.last1.log PubDate.last10 PubDate.last10.log
## 11 10 10
## PubDate.last100 PubDate.last100.log WordCount.log
## 100 100 0
## .rnorm
## 0
## [1] "numeric data w/ Infs in : "
## WordCount Popular UniqueID
## 0 0 0
## Popular.fctr PubDate.POSIX PubDate.year.fctr
## 0 0 0
## PubDate.month.fctr PubDate.date.fctr PubDate.wkday.fctr
## 0 0 0
## PubDate.wkend PubDate.hour.fctr PubDate.minute.fctr
## 0 0 0
## PubDate.second.fctr PubDate.zoo PubDate.last1
## 0 0 0
## PubDate.last1.log PubDate.last10 PubDate.last10.log
## 0 0 0
## PubDate.last100 PubDate.last100.log WordCount.log
## 0 0 0
## .rnorm
## 0
## [1] "numeric data w/ NaNs in : "
## WordCount Popular UniqueID
## 0 0 0
## Popular.fctr PubDate.POSIX PubDate.year.fctr
## 0 0 0
## PubDate.month.fctr PubDate.date.fctr PubDate.wkday.fctr
## 0 0 0
## PubDate.wkend PubDate.hour.fctr PubDate.minute.fctr
## 0 0 0
## PubDate.second.fctr PubDate.zoo PubDate.last1
## 0 0 0
## PubDate.last1.log PubDate.last10 PubDate.last10.log
## 0 0 0
## PubDate.last100 PubDate.last100.log WordCount.log
## 0 0 0
## .rnorm
## 0
## [1] "string data missing in : "
## NewsDesk SectionName SubsectionName Headline Snippet
## 2408 2899 6176 0 13
## Abstract PubDate
## 17 0
# Print the frequency table of each category column of glb_allobs_df
#   (NewsDesk, SectionName, SubsectionName), each preceded by its name.
# Returns (invisibly): the table printed last, i.e. that of SubsectionName
dsp_catgs <- function() {
    catg_cols <- c("NewsDesk", "SectionName", "SubsectionName")
    last_tbl <- NULL
    for (catg in catg_cols) {
        print(paste0(catg, ":"))
        last_tbl <- print(table(glb_allobs_df[[catg]]))
    }
    invisible(last_tbl)
}
# Select observations of glb_allobs_df that satisfy ALL supplied filters.
#
# Args (every filter is skipped when left NULL):
#   Popular:           value of Popular to match; NA selects the rows whose
#                      Popular is missing
#   NewsDesk, SectionName, SubsectionName:
#                      exact value to match in the column of the same name
#   Headline.contains, Snippet.contains, Abstract.contains:
#                      regular expression that Headline / Snippet / Abstract
#                      has to match
#   Headline.pfx, NewsDesk.nb, .clusterid, myCategory:
#                      exact value to match in the column of the same name;
#                      when glb_allobs_df has no such column a warning is
#                      issued and the filter is ignored
#   perl:              passed on to the regular-expression matching
# Returns:
#   logical vector with one element per row of glb_allobs_df; TRUE for the
#   rows whose UniqueID belongs to a selected observation
sel_obs <- function(Popular=NULL,
    NewsDesk=NULL, SectionName=NULL, SubsectionName=NULL,
    Headline.contains=NULL, Snippet.contains=NULL, Abstract.contains=NULL,
    Headline.pfx=NULL, NewsDesk.nb=NULL, .clusterid=NULL, myCategory=NULL,
    perl=FALSE) {
    obs_df <- glb_allobs_df
    # Filters are combined in a logical mask instead of copying the data
    #   frame once per filter
    keep <- rep(TRUE, nrow(obs_df))

    # TRUE where the column equals value; an NA comparison is never a match
    is_equal <- function(col_name, value) {
        cmp <- obs_df[[col_name]] == value
        !is.na(cmp) & cmp
    }
    # TRUE where the column matches the regular expression
    is_match <- function(col_name, pattern)
        grepl(pattern, obs_df[[col_name]], perl=perl)

    if (!is.null(Popular)) {
        if (is.na(Popular))
            keep <- keep & is.na(obs_df[["Popular"]]) else
            keep <- keep & is_equal("Popular", Popular)
    }

    if (!is.null(NewsDesk))
        keep <- keep & is_equal("NewsDesk", NewsDesk)
    if (!is.null(SectionName))
        keep <- keep & is_equal("SectionName", SectionName)
    if (!is.null(SubsectionName))
        keep <- keep & is_equal("SubsectionName", SubsectionName)

    if (!is.null(Headline.contains))
        keep <- keep & is_match("Headline", Headline.contains)
    if (!is.null(Snippet.contains))
        keep <- keep & is_match("Snippet", Snippet.contains)
    if (!is.null(Abstract.contains))
        keep <- keep & is_match("Abstract", Abstract.contains)

    # Columns that are only created by later steps. The column has to exist
    #   under exactly this name: a substring test would accept e.g.
    #   "myCategory.fctr" for "myCategory" and filter on the wrong column
    opt_filters <- list(Headline.pfx=Headline.pfx, NewsDesk.nb=NewsDesk.nb,
                        .clusterid=.clusterid, myCategory=myCategory)
    for (col_name in names(opt_filters)) {
        value <- opt_filters[[col_name]]
        if (is.null(value)) next
        if (col_name %in% names(obs_df))
            keep <- keep & is_equal(col_name, value) else
            warning("glb_allobs_df does not contain ", col_name,
                    "; ignoring that filter")
    }

    return(obs_df$UniqueID %in% obs_df$UniqueID[keep])
}
# Display the observations of glb_allobs_df selected by sel_obs().
#
# Args:
#   ...:  filters, passed on to sel_obs()
#   cols: names of additional columns to display
#   all:  TRUE prints every selected row; FALSE hands the rows to myprint_df()
dsp_obs <- function(..., cols=c(NULL), all=FALSE) {
    dflt_cols <- c("UniqueID", "Popular", "myCategory", "Headline")
    # Default columns that are not in glb_allobs_df (yet) are left out rather
    #   than failing with "undefined columns selected"; columns requested via
    #   `cols` are still required to exist
    dsp_cols <- union(intersect(dflt_cols, names(glb_allobs_df)), cols)
    tmp_df <- glb_allobs_df[sel_obs(...), dsp_cols, FALSE]
    if(all) { print(tmp_df) } else { myprint_df(tmp_df) }
}
#dsp_obs(Popular=1, NewsDesk="", SectionName="", Headline.contains="Boehner")
# dsp_obs(Popular=1, NewsDesk="", SectionName="")
# dsp_obs(Popular=NA, NewsDesk="", SectionName="")
# Print the 4-way contingency table NewsDesk x SectionName x SubsectionName x
#   Popular for the observations selected by sel_obs(); NA gets its own level
#   wherever it occurs.
#
# Args:
#   ...: filters, passed on to sel_obs()
dsp_tbl <- function(...) {
    sel_df <- glb_allobs_df[sel_obs(...), ]
    tbl_cols <- c("NewsDesk", "SectionName", "SubsectionName", "Popular")
    # Columns are handed over unnamed so the table dimensions stay unlabelled
    xtab <- do.call(table,
                    c(unname(as.list(sel_df[tbl_cols])), list(useNA="ifany")))
    #print(names(xtab))
    #print(dimnames(xtab))
    print(xtab)
}
# Print the cross-tab of Headline.pfx, Headline and the response for the
#   observations whose Headline matches the regular expression `str`
dsp_hdlxtab <- function(str) {
    hdl_obs_df <- glb_allobs_df[sel_obs(Headline.contains=str), ]
    xtab_cols <- c("Headline.pfx", "Headline", glb_rsp_var)
    print(mycreate_sqlxtab_df(hdl_obs_df, xtab_cols))
}
#dsp_hdlxtab("(1914)|(1939)")
# Print a cross-tab of Headline.pfx / NewsDesk / SectionName / SubsectionName /
# response variable for the observations whose Headline matches the pattern `str`.
dsp_catxtab <- function(str) {
    hdl_obs_df <- glb_allobs_df[sel_obs(Headline.contains=str), ]
    xtab_cols <- c("Headline.pfx", "NewsDesk", "SectionName", "SubsectionName", glb_rsp_var)
    print(mycreate_sqlxtab_df(hdl_obs_df, xtab_cols))
}
# dsp_catxtab("1914)|(1939)")
# dsp_catxtab("19(14|39|64):")
# dsp_catxtab("19..:")
# Create myCategory <- NewsDesk#SectionName#SubsectionName
# Fix some data before merging categories
# Patch blank / inconsistent section labels for recurring columns, identified by
# their headline prefix, before the labels are combined into myCategory.
glb_allobs_df$NewsDesk[
    sel_obs(Headline.contains="Your Turn:", NewsDesk="")] <- "Styles"
glb_allobs_df$SubsectionName[
    sel_obs(Headline.contains="School", NewsDesk="", SectionName="U.S.",
            SubsectionName="")] <- "Education"
# "Today in Small Business:" items filed under the Business desk
glb_allobs_df$SectionName[
    sel_obs(Headline.contains="Today in Small Business:", NewsDesk="Business")] <-
    "Business Day"
glb_allobs_df$SubsectionName[
    sel_obs(Headline.contains="Today in Small Business:", NewsDesk="Business")] <-
    "Small Business"
# "Readers Respond:" items belong to Opinion / Room For Debate
glb_allobs_df$SectionName[
    sel_obs(Headline.contains="Readers Respond:")] <- "Opinion"
glb_allobs_df$SubsectionName[
    sel_obs(Headline.contains="Readers Respond:")] <- "Room For Debate"
# glb_allobs_df[sel_obs(NewsDesk="Business", SectionName="", SubsectionName="", Popular=NA),
# "SubsectionName"] <- "Small Business"
# print(glb_allobs_df[glb_allobs_df$UniqueID %in% c(7973),
# c("UniqueID", "Headline", "myCategory", "NewsDesk", "SectionName", "SubsectionName")])
#
# glb_allobs_df[sel_obs(NewsDesk="Business", SectionName="", SubsectionName=""),
# "SectionName"] <- "Technology"
# print(glb_allobs_df[glb_allobs_df$UniqueID %in% c(5076, 5736, 5924, 5911, 6532),
# c("UniqueID", "Headline", "myCategory", "NewsDesk", "SectionName", "SubsectionName")])
#
# glb_allobs_df[sel_obs(SectionName="Health"),
# "NewsDesk"] <- "Science"
# glb_allobs_df[sel_obs(SectionName="Travel"),
# "NewsDesk"] <- "Travel"
#
# glb_allobs_df[sel_obs(SubsectionName="Fashion & Style"),
# "SectionName"] <- ""
# glb_allobs_df[sel_obs(SubsectionName="Fashion & Style"),
# "SubsectionName"] <- ""
# glb_allobs_df[sel_obs(NewsDesk="Styles", SectionName="", SubsectionName="", Popular=1),
# "SectionName"] <- "U.S."
# print(glb_allobs_df[glb_allobs_df$UniqueID %in% c(5486),
# c("UniqueID", "Headline", "myCategory", "NewsDesk", "SectionName", "SubsectionName")])
#
# Composite category key: NewsDesk#SectionName#SubsectionName
glb_allobs_df$myCategory <- with(glb_allobs_df,
    paste(NewsDesk, SectionName, SubsectionName, sep="#"))
# Inspect the "... Music:" series to see how consistently it is categorised.
# Optional extra filters kept for ad-hoc use:
#   NewsDesk="", SectionName="", SubsectionName="Fashion & Style", Popular=1 (or NA)
dsp_obs(Headline.contains="Music:",
        cols=c("UniqueID", "Headline", "Popular", "myCategory",
               "NewsDesk", "SectionName", "SubsectionName"),
        all=TRUE)
## UniqueID Popular myCategory
## 305 305 0 OpEd#Opinion#
## 844 844 1 OpEd#Opinion#
## 1331 1331 0 OpEd#Opinion#
## 1974 1974 0 OpEd#Opinion#
## 2563 2563 0 OpEd#Opinion#
## 3091 3091 0 OpEd#Opinion#
## 3589 3589 0 OpEd#Opinion#
## 4631 4631 0 OpEd#Opinion#
## 5125 5125 0 OpEd#Opinion#
## 5630 5630 0 OpEd#Opinion#
## 6095 6095 0 OpEd#Opinion#
## 6513 6513 1 OpEd#Opinion#
## 6927 6927 NA OpEd#Opinion#
## 7473 7473 NA #Opinion#
## 7931 7931 NA OpEd#Opinion#
## 8217 8217 NA OpEd#Opinion#
## Headline NewsDesk
## 305 Friday Night Music: Lucius Covers John Lennon OpEd
## 844 Friday Night Music: Cheryl Wheeler OpEd
## 1331 Friday Night Music: Cheryl Wheeler, Summer Fly OpEd
## 1974 Friday Night Music: Quilt OpEd
## 2563 Friday Night Music: Lucius in Asheville OpEd
## 3091 Friday Night Music: Sarah Jarosz and the Milk Carton Kids OpEd
## 3589 Friday Night Music: Lucius Covers the Kinks OpEd
## 4631 Friday Night Music: Amason OpEd
## 5125 Friday Night Music: Suzanne Vega, Jacob and the Angel OpEd
## 5630 Friday Night Music: Suzanne Vega, I Never Wear White OpEd
## 6095 Friday Night Music: Jessica Hernandez and the Deltas OpEd
## 6513 Saturday Morning Music: Stay Gold OpEd
## 6927 Friday Night Music: Lucius, Monsters OpEd
## 7473 Friday Night Music: Peter Gabriel, 1993
## 7931 Friday Night Music: The Roches, Winter Wonderland OpEd
## 8217 Friday Night Music: Sarah Jarosz and Aoife O'Donovan OpEd
## SectionName SubsectionName
## 305 Opinion
## 844 Opinion
## 1331 Opinion
## 1974 Opinion
## 2563 Opinion
## 3091 Opinion
## 3589 Opinion
## 4631 Opinion
## 5125 Opinion
## 5630 Opinion
## 6095 Opinion
## 6513 Opinion
## 6927 Opinion
## 7473 Opinion
## 7931 Opinion
## 8217 Opinion
# List every Opinion-section observation whose NewsDesk and SubsectionName are blank.
# Optional extra filter kept for ad-hoc use: Popular=1 (or NA)
dsp_obs(Headline.contains=".",
        NewsDesk="",
        SectionName="Opinion",
        SubsectionName="",
        cols=c("UniqueID", "Headline", "Popular", "myCategory",
               "NewsDesk", "SectionName", "SubsectionName"),
        all=TRUE)
## UniqueID Popular myCategory
## 516 516 0 #Opinion#
## 918 918 0 #Opinion#
## 7473 7473 NA #Opinion#
## 7445 7445 NA #Opinion#
## 7419 7419 NA #Opinion#
## 7505 7505 NA #Opinion#
## 7509 7509 NA #Opinion#
## Headline
## 516 This Is Life Among the Roma, Europes Forgotten People
## 918 What Might Happen If Iran Becomes America's Covert Ally?
## 7473 Friday Night Music: Peter Gabriel, 1993
## 7445 Senate Committee Bothered to Authorize War Against Islamic State
## 7419 Joe on WNYCs Money Talking
## 7505 Rev. Dr. William Barber II on Todays Protest Movements
## 7509 Did Salaita Cross the Line of Civility?
## NewsDesk SectionName SubsectionName
## 516 Opinion
## 918 Opinion
## 7473 Opinion
## 7445 Opinion
## 7419 Opinion
## 7505 Opinion
## 7509 Opinion
# Merge some categories
# Merge sparsely populated / inconsistently labelled category keys.
# The map is built inside local() so that it does not linger in the workspace.
glb_allobs_df$myCategory <- local({
    ctgry_merge_map <- c(
        # Business
        "#Business Day#Dealbook"       = "Business#Business Day#Dealbook",
        "#Business Day#Small Business" = "Business#Business Day#Small Business",
        "#Crosswords/Games#"           = "Business#Crosswords/Games#",
        "Business##"                   = "Business#Technology#",
        "#Open#"                       = "Business#Technology#",
        "#Technology#"                 = "Business#Technology#",
        # Culture
        "#Arts#"                       = "Culture#Arts#",
        "Culture##"                    = "Culture#Arts#",
        # Foreign
        "#World#Asia Pacific"          = "Foreign#World#Asia Pacific",
        "Foreign##"                    = "Foreign#World#",
        # Metro
        "#N.Y. / Region#"              = "Metro#N.Y. / Region#",
        # OpEd
        "#Opinion#"                    = "OpEd#Opinion#",
        "OpEd##"                       = "OpEd#Opinion#",
        # Science
        "#Health#"                     = "Science#Health#",
        "Science##"                    = "Science#Health#",
        # Styles
        "Styles##"                     = "Styles##Fashion",
        "Styles#Health#"               = "Science#Health#",
        "Styles#Style#Fashion & Style" = "Styles##Fashion",
        # Travel
        "#Travel#"                     = "Travel#Travel#",
        # Everything too small to stand alone
        "Magazine#Magazine#"           = "myOther",
        "National##"                   = "myOther",
        "National#U.S.#Politics"       = "myOther",
        "Sports##"                     = "myOther",
        "Sports#Sports#"               = "myOther",
        "#U.S.#"                       = "myOther",
        # Identity mapping; lets every line above end with a comma
        "##"                           = "##"
        # Alternatives tried and currently disabled:
        # "Business##Small Business" = "Business#Business Day#Small Business",
        # "#Opinion#" = "#Opinion#Room For Debate",
        # "Business##" = "Business#Business Day#Dealbook",
        # "Foreign#World#" = "Foreign##",
        # "#Open#" = "Other",
        # "#Opinion#The Public Editor" = "OpEd#Opinion#",
        # "Styles#Health#" = "Styles##",
        # "Styles#Style#Fashion & Style" = "Styles##",
        # "#U.S.#" = "#U.S.#Education",
    )
    plyr::revalue(glb_allobs_df$myCategory, ctgry_merge_map)
})
# Cross-tab of merged category vs. raw section labels vs. response,
# sorted by the sort key built from c("-", ".n").
ctgry_xtab_df <- mycreate_sqlxtab_df(glb_allobs_df,
    c("myCategory", "NewsDesk", "SectionName", "SubsectionName", glb_rsp_var))
ctgry_xtab_df <- orderBy(reformulate(c("-", ".n")), ctgry_xtab_df)
# myprint_df(ctgry_xtab_df)
# write.table(ctgry_xtab_df, paste0(glb_out_pfx, "ctgry_xtab.csv"),
# row.names=FALSE)
# One row per label combination, one count column per Popular.fctr level
# (summing .n), then sorted by -Y, -NA.
ctgry_cast_df <- dcast(ctgry_xtab_df,
    myCategory + NewsDesk + SectionName + SubsectionName ~ Popular.fctr,
    sum, value.var=".n")
ctgry_cast_df <- orderBy(~ -Y -NA, ctgry_cast_df)
myprint_df(ctgry_cast_df)
## myCategory NewsDesk SectionName SubsectionName
## 33 OpEd#Opinion# OpEd Opinion
## 36 Science#Health# Science Health
## 1 ##
## 11 Business#Crosswords/Games# Business Crosswords/Games
## 40 Styles#U.S.# Styles U.S.
## 7 Business#Business Day#Dealbook Business Business Day Dealbook
## N Y NA
## 33 113 407 141
## 36 73 119 55
## 1 1163 110 338
## 11 19 103 38
## 40 77 100 62
## 7 864 88 291
## myCategory NewsDesk SectionName
## 35 Science#Health# Science
## 17 Culture#Arts# Culture
## 16 Culture#Arts# Arts
## 8 Business#Business Day#Small Business Business Day
## 13 Business#Technology# Technology
## 28 myOther National U.S.
## SubsectionName N Y NA
## 35 0 2 2
## 17 1 0 70
## 16 0 0 11
## 8 Small Business 1 0 4
## 13 0 0 1
## 28 Politics 2 0 0
## myCategory NewsDesk SectionName SubsectionName N Y NA
## 27 myOther National 2 0 0
## 28 myOther National U.S. Politics 2 0 0
## 29 myOther Sports 1 0 0
## 30 myOther Sports Sports 1 0 0
## 37 Science#Health# Styles Health 1 0 0
## 39 Styles##Fashion Styles Style Fashion & Style 2 0 0
# Persist the category cast table, then tabulate merged category vs. response.
write.table(ctgry_cast_df, file=paste0(glb_out_pfx, "ctgry_cast.csv"),
            row.names=FALSE)
ctgry_sum_tbl <- table(glb_allobs_df$myCategory, glb_allobs_df[, glb_rsp_var],
                       useNA="ifany")
print(ctgry_sum_tbl)
##
## N Y <NA>
## ## 1163 110 338
## #Multimedia# 139 2 52
## #Opinion#Room For Debate 69 7 24
## #Opinion#The Public Editor 4 16 10
## #U.S.#Education 325 0 90
## Business#Business Day#Dealbook 864 88 304
## Business#Business Day#Small Business 135 5 42
## Business#Crosswords/Games# 20 103 42
## Business#Technology# 288 51 113
## Culture#Arts# 626 50 244
## Foreign#World# 172 0 47
## Foreign#World#Asia Pacific 200 3 56
## Metro#N.Y. / Region# 181 17 67
## myOther 38 0 3
## OpEd#Opinion# 115 408 164
## Science#Health# 74 122 57
## Styles##Fashion 118 1 15
## Styles#U.S.# 77 100 62
## Travel#Travel# 116 1 35
## TStyle## 715 9 105
# Chi-squared test of association between "observation matches the sel_obs(...)
# filter" and Popular, using only observations whose Popular is not NA.
# Prints the 2-way table (rows: selected 0/1; cols: Popular) and the test result;
# returns the value of the final print() call.
dsp_chisq.test <- function(...) {
    has_rsp <- !is.na(glb_allobs_df$Popular)
    ref_df <- glb_allobs_df[has_rsp, ]
    sel_df <- glb_allobs_df[sel_obs(...) & has_rsp, ]
    sel_df$.marker <- 1
    # Left-join the marker onto the reference set; unmatched rows get NA ...
    mrg_df <- merge(ref_df[, c(glb_id_vars, "Popular")],
                    sel_df[, c(glb_id_vars, ".marker")],
                    all.x=TRUE)
    # ... which is then recoded to 0 (= not selected)
    mrg_df[is.na(mrg_df)] <- 0
    mrg_tbl <- table(mrg_df$.marker, mrg_df$Popular)
    print(mrg_tbl)
    print("Rows:Selected; Cols:Popular")
    #print(mrg_tbl)
    print(chisq.test(mrg_tbl))
}
# dsp_chisq.test(Headline.contains="[Ee]bola")
# dsp_chisq.test(Snippet.contains="[Ee]bola")
# dsp_chisq.test(Abstract.contains="[Ee]bola")
# print(mycreate_sqlxtab_df(glb_allobs_df[sel_obs(Headline.contains="[Ee]bola"), ],
# c(glb_rsp_var, "NewsDesk", "SectionName", "SubsectionName")))
# print(table(glb_allobs_df$NewsDesk, glb_allobs_df$SectionName))
# print(table(glb_allobs_df$SectionName, glb_allobs_df$SubsectionName))
# print(table(glb_allobs_df$NewsDesk, glb_allobs_df$SectionName, glb_allobs_df$SubsectionName))
# The raw section labels are superseded by myCategory; keep them out of the feature set.
glb_exclude_vars_as_features <-
    union(glb_exclude_vars_as_features,
          c("NewsDesk", "SectionName", "SubsectionName"))
# Copy Headline into Snippet & Abstract if they are empty
print(glb_allobs_df[nchar(glb_allobs_df$Snippet) == 0, c("Headline", "Snippet")])
## Headline
## 2838 First Draft Focus: Off to Raise Money for Democrats
## 3728 Verbatim: Obama as Supreme Court Justice?
## 4904 Election 2014: Live Coverage
## 4994 Election 2014: Live Coverage
## 5065 First Draft Focus: Honoring a Civil War Hero
## 5029 First Draft Focus: Perry's Day in Court
## 5160 Supreme Court to Hear New Health Law Challenge
## 5254 Verbatim: Will Rick Perry Run?
## 5472 First Draft Focus: A Red Carpet Welcome
## 7164 Does Torture Work? C.I.A.'s Claims vs. Senate Panel's Findings
## 7129 First Draft Focus: Pass a Bill
## 7368 Verbatim: The People's Priorities
## 7364 First Draft Focus: Three Wise Men
## Snippet
## 2838
## 3728
## 4904
## 4994
## 5065
## 5029
## 5160
## 5254
## 5472
## 7164
## 7129
## 7368
## 7364
# Check whether any Snippet already duplicates its Headline before back-filling.
print(glb_allobs_df[with(glb_allobs_df, Headline == Snippet),
                    c("UniqueID", "Headline", "Snippet")])
## [1] UniqueID Headline Snippet
## <0 rows> (or 0-length row.names)
# Back-fill empty Snippets with the Headline text.
glb_allobs_df$Snippet[nchar(glb_allobs_df$Snippet) == 0] <-
    glb_allobs_df$Headline[nchar(glb_allobs_df$Snippet) == 0]
# Same inspection for Abstract.
print(glb_allobs_df[nchar(glb_allobs_df$Abstract) == 0, c("Headline", "Abstract")])
## Headline
## 2838 First Draft Focus: Off to Raise Money for Democrats
## 3728 Verbatim: Obama as Supreme Court Justice?
## 4904 Election 2014: Live Coverage
## 4994 Election 2014: Live Coverage
## 5065 First Draft Focus: Honoring a Civil War Hero
## 5029 First Draft Focus: Perry's Day in Court
## 5160 Supreme Court to Hear New Health Law Challenge
## 5254 Verbatim: Will Rick Perry Run?
## 5472 First Draft Focus: A Red Carpet Welcome
## 7164 Does Torture Work? C.I.A.'s Claims vs. Senate Panel's Findings
## 7129 First Draft Focus: Pass a Bill
## 7368 Verbatim: The People's Priorities
## 7364 First Draft Focus: Three Wise Men
## 7329 Obama Works the Phones to Get Funding Deal Done
## 7315 House Democrats Vent Frustration With White House
## 7310 Funding Bill Hangs in Balance as House Votes
## 7309 Spending Bill Passes House With Democratic Support
## Abstract
## 2838
## 3728
## 4904
## 4994
## 5065
## 5029
## 5160
## 5254
## 5472
## 7164
## 7129
## 7368
## 7364
## 7329
## 7315
## 7310
## 7309
# Check whether any Abstract already duplicates its Headline before back-filling.
print(glb_allobs_df[with(glb_allobs_df, Headline == Abstract),
                    c("UniqueID", "Headline", "Abstract")])
## [1] UniqueID Headline Abstract
## <0 rows> (or 0-length row.names)
# Back-fill empty Abstracts with the Headline text.
glb_allobs_df$Abstract[nchar(glb_allobs_df$Abstract) == 0] <-
    glb_allobs_df$Headline[nchar(glb_allobs_df$Abstract) == 0]
# WordCount_0_df <- subset(glb_allobs_df, WordCount == 0)
# table(WordCount_0_df$Popular, WordCount_0_df$WordCount, useNA="ifany")
# myprint_df(WordCount_0_df[,
# c("UniqueID", "Popular", "WordCount", "Headline")])
# Log the start of the "manage.missing.data" step in glb_chunks_df.
# myadd_chunk() is defined outside this section; judging by the log printed below,
# it also records end/elapsed for the previous step (major.inc=FALSE -> minor step).
glb_chunks_df <- myadd_chunk(glb_chunks_df, "manage.missing.data", major.inc=FALSE)
## label step_major step_minor bgn end elapsed
## 3 cleanse.data 2 1 33.040 36.982 3.942
## 4 manage.missing.data 2 2 36.983 NA NA
2.2: manage missing data
# print(sapply(names(glb_trnobs_df), function(col) sum(is.na(glb_trnobs_df[, col]))))
# print(sapply(names(glb_newobs_df), function(col) sum(is.na(glb_newobs_df[, col]))))
# glb_trnobs_df <- na.omit(glb_trnobs_df)
# glb_newobs_df <- na.omit(glb_newobs_df)
# df[is.na(df)] <- 0
# Report problem data before imputation. Per the output below, this lists per-column
# counts of missing values, zeros, Infs and NaNs for numeric columns and of missing
# values for string columns (dsp_problem_data() is defined outside this section).
dsp_problem_data(glb_allobs_df)
## [1] "numeric data missing in : "
## WordCount Popular UniqueID
## 0 1870 0
## Popular.fctr PubDate.POSIX PubDate.year.fctr
## 1870 0 0
## PubDate.month.fctr PubDate.date.fctr PubDate.wkday.fctr
## 0 0 0
## PubDate.wkend PubDate.hour.fctr PubDate.minute.fctr
## 0 0 0
## PubDate.second.fctr PubDate.zoo PubDate.last1
## 0 0 0
## PubDate.last1.log PubDate.last10 PubDate.last10.log
## 0 0 0
## PubDate.last100 PubDate.last100.log WordCount.log
## 0 0 109
## .rnorm
## 0
## [1] "numeric data w/ 0s in : "
## WordCount Popular UniqueID
## 109 5439 0
## Popular.fctr PubDate.POSIX PubDate.year.fctr
## 0 0 0
## PubDate.month.fctr PubDate.date.fctr PubDate.wkday.fctr
## 0 0 378
## PubDate.wkend PubDate.hour.fctr PubDate.minute.fctr
## 7624 0 0
## PubDate.second.fctr PubDate.zoo PubDate.last1
## 0 0 11
## PubDate.last1.log PubDate.last10 PubDate.last10.log
## 11 10 10
## PubDate.last100 PubDate.last100.log WordCount.log
## 100 100 0
## .rnorm
## 0
## [1] "numeric data w/ Infs in : "
## WordCount Popular UniqueID
## 0 0 0
## Popular.fctr PubDate.POSIX PubDate.year.fctr
## 0 0 0
## PubDate.month.fctr PubDate.date.fctr PubDate.wkday.fctr
## 0 0 0
## PubDate.wkend PubDate.hour.fctr PubDate.minute.fctr
## 0 0 0
## PubDate.second.fctr PubDate.zoo PubDate.last1
## 0 0 0
## PubDate.last1.log PubDate.last10 PubDate.last10.log
## 0 0 0
## PubDate.last100 PubDate.last100.log WordCount.log
## 0 0 0
## .rnorm
## 0
## [1] "numeric data w/ NaNs in : "
## WordCount Popular UniqueID
## 0 0 0
## Popular.fctr PubDate.POSIX PubDate.year.fctr
## 0 0 0
## PubDate.month.fctr PubDate.date.fctr PubDate.wkday.fctr
## 0 0 0
## PubDate.wkend PubDate.hour.fctr PubDate.minute.fctr
## 0 0 0
## PubDate.second.fctr PubDate.zoo PubDate.last1
## 0 0 0
## PubDate.last1.log PubDate.last10 PubDate.last10.log
## 0 0 0
## PubDate.last100 PubDate.last100.log WordCount.log
## 0 0 0
## .rnorm
## 0
## [1] "string data missing in : "
## NewsDesk SectionName SubsectionName Headline Snippet
## 2407 2883 6156 0 0
## Abstract PubDate myCategory
## 0 0 0
# Not refactored into mydsutils.R since glb_*_df might be reassigned
# Impute missing values with mice and return the completed WordCount.log column.
#
# Reads the globals glb_allobs_df, glb_exclude_vars_as_features, glb_rsp_var and
# glb_mice_complete.seed. Every column of glb_allobs_df that is neither excluded
# as a feature nor the response variable is handed to mice() as input.
# Prints summary() of the imputation input and of the completed data.
#
# Returns: the "WordCount.log" column of the completed data set.
# Stops with an explicit error when the mice package is not installed.
glb_impute_missing_data <- function() {
    # require() merely returns FALSE when the package is missing; fail here with a
    # clear message instead of a later "could not find function" error.
    if (!require(mice))
        stop("package 'mice' is required to impute missing data", call.=FALSE)
    set.seed(glb_mice_complete.seed)
    feat_vars <- setdiff(names(glb_allobs_df),
                         union(glb_exclude_vars_as_features, glb_rsp_var))
    inp_impent_df <- glb_allobs_df[, feat_vars]
    print("Summary before imputation: ")
    print(summary(inp_impent_df))
    # Namespace-qualified so that a complete() from another attached package
    # cannot mask mice's.
    out_impent_df <- mice::complete(mice::mice(inp_impent_df))
    print(summary(out_impent_df))
    return(out_impent_df[, "WordCount.log"])
}
# Replace WordCount.log with its imputed version when imputation is switched on.
if (glb_impute_na_data) {
    glb_allobs_df[, "WordCount.log"] <- glb_impute_missing_data()
}
## Loading required package: mice
## Loading required package: Rcpp
## Loading required package: lattice
## mice 2.22 2014-06-10
## [1] "Summary before imputation: "
## PubDate.year.fctr PubDate.date.fctr PubDate.wkday.fctr PubDate.wkend
## 2014:8402 (0.97,7]:1981 0: 378 Min. :0.0000
## (7,13] :1757 1:1605 1st Qu.:0.0000
## (13,19] :1808 2:1559 Median :0.0000
## (19,25] :1650 3:1614 Mean :0.0926
## (25,31] :1206 4:1539 3rd Qu.:0.0000
## 5:1470 Max. :1.0000
## 6: 237
## PubDate.hour.fctr PubDate.minute.fctr PubDate.second.fctr
## (-0.023,7.67]:1610 (-0.059,14.8]:3119 (-0.059,14.8]:2134
## (7.67,15.3] :4484 (14.8,29.5] :1671 (14.8,29.5] :2063
## (15.3,23] :2308 (29.5,44.2] :1995 (29.5,44.2] :2112
## (44.2,59.1] :1617 (44.2,59.1] :2093
##
##
##
## PubDate.last1.log PubDate.last10.log PubDate.last100.log WordCount.log
## Min. : 0.000 Min. : 0.000 Min. : 0.00 Min. :0.6932
## 1st Qu.: 5.263 1st Qu.: 8.516 1st Qu.:11.37 1st Qu.:5.2679
## Median : 6.292 Median : 8.868 Median :11.43 Median :5.9480
## Mean : 6.094 Mean : 9.048 Mean :11.49 Mean :5.8263
## 3rd Qu.: 7.126 3rd Qu.: 9.424 3rd Qu.:11.78 3rd Qu.:6.6067
## Max. :10.875 Max. :11.744 Max. :12.95 Max. :9.2977
## NA's :109
## .rnorm myCategory
## Min. :-3.281785 Length:8402
## 1st Qu.:-0.681275 Class :character
## Median : 0.007735 Mode :character
## Mean :-0.000264
## 3rd Qu.: 0.673409
## Max. : 3.987726
##
##
## iter imp variable
## 1 1 WordCount.log
## 1 2 WordCount.log
## 1 3 WordCount.log
## 1 4 WordCount.log
## 1 5 WordCount.log
## 2 1 WordCount.log
## 2 2 WordCount.log
## 2 3 WordCount.log
## 2 4 WordCount.log
## 2 5 WordCount.log
## 3 1 WordCount.log
## 3 2 WordCount.log
## 3 3 WordCount.log
## 3 4 WordCount.log
## 3 5 WordCount.log
## 4 1 WordCount.log
## 4 2 WordCount.log
## 4 3 WordCount.log
## 4 4 WordCount.log
## 4 5 WordCount.log
## 5 1 WordCount.log
## 5 2 WordCount.log
## 5 3 WordCount.log
## 5 4 WordCount.log
## 5 5 WordCount.log
## PubDate.year.fctr PubDate.date.fctr PubDate.wkday.fctr PubDate.wkend
## 2014:8402 (0.97,7]:1981 0: 378 Min. :0.0000
## (7,13] :1757 1:1605 1st Qu.:0.0000
## (13,19] :1808 2:1559 Median :0.0000
## (19,25] :1650 3:1614 Mean :0.0926
## (25,31] :1206 4:1539 3rd Qu.:0.0000
## 5:1470 Max. :1.0000
## 6: 237
## PubDate.hour.fctr PubDate.minute.fctr PubDate.second.fctr
## (-0.023,7.67]:1610 (-0.059,14.8]:3119 (-0.059,14.8]:2134
## (7.67,15.3] :4484 (14.8,29.5] :1671 (14.8,29.5] :2063
## (15.3,23] :2308 (29.5,44.2] :1995 (29.5,44.2] :2112
## (44.2,59.1] :1617 (44.2,59.1] :2093
##
##
##
## PubDate.last1.log PubDate.last10.log PubDate.last100.log WordCount.log
## Min. : 0.000 Min. : 0.000 Min. : 0.00 Min. :0.6931
## 1st Qu.: 5.263 1st Qu.: 8.516 1st Qu.:11.37 1st Qu.:5.2730
## Median : 6.292 Median : 8.868 Median :11.43 Median :5.9480
## Mean : 6.094 Mean : 9.048 Mean :11.49 Mean :5.8267
## 3rd Qu.: 7.126 3rd Qu.: 9.424 3rd Qu.:11.78 3rd Qu.:6.6067
## Max. :10.875 Max. :11.744 Max. :12.95 Max. :9.2977
##
## .rnorm myCategory
## Min. :-3.281785 Length:8402
## 1st Qu.:-0.681275 Class :character
## Median : 0.007735 Mode :character
## Mean :-0.000264
## 3rd Qu.: 0.673409
## Max. : 3.987726
##
# Re-run the problem-data report after imputation; in the output below
# WordCount.log no longer shows missing values.
dsp_problem_data(glb_allobs_df)
## [1] "numeric data missing in : "
## WordCount Popular UniqueID
## 0 1870 0
## Popular.fctr PubDate.POSIX PubDate.year.fctr
## 1870 0 0
## PubDate.month.fctr PubDate.date.fctr PubDate.wkday.fctr
## 0 0 0
## PubDate.wkend PubDate.hour.fctr PubDate.minute.fctr
## 0 0 0
## PubDate.second.fctr PubDate.zoo PubDate.last1
## 0 0 0
## PubDate.last1.log PubDate.last10 PubDate.last10.log
## 0 0 0
## PubDate.last100 PubDate.last100.log WordCount.log
## 0 0 0
## .rnorm
## 0
## [1] "numeric data w/ 0s in : "
## WordCount Popular UniqueID
## 109 5439 0
## Popular.fctr PubDate.POSIX PubDate.year.fctr
## 0 0 0
## PubDate.month.fctr PubDate.date.fctr PubDate.wkday.fctr
## 0 0 378
## PubDate.wkend PubDate.hour.fctr PubDate.minute.fctr
## 7624 0 0
## PubDate.second.fctr PubDate.zoo PubDate.last1
## 0 0 11
## PubDate.last1.log PubDate.last10 PubDate.last10.log
## 11 10 10
## PubDate.last100 PubDate.last100.log WordCount.log
## 100 100 0
## .rnorm
## 0
## [1] "numeric data w/ Infs in : "
## WordCount Popular UniqueID
## 0 0 0
## Popular.fctr PubDate.POSIX PubDate.year.fctr
## 0 0 0
## PubDate.month.fctr PubDate.date.fctr PubDate.wkday.fctr
## 0 0 0
## PubDate.wkend PubDate.hour.fctr PubDate.minute.fctr
## 0 0 0
## PubDate.second.fctr PubDate.zoo PubDate.last1
## 0 0 0
## PubDate.last1.log PubDate.last10 PubDate.last10.log
## 0 0 0
## PubDate.last100 PubDate.last100.log WordCount.log
## 0 0 0
## .rnorm
## 0
## [1] "numeric data w/ NaNs in : "
## WordCount Popular UniqueID
## 0 0 0
## Popular.fctr PubDate.POSIX PubDate.year.fctr
## 0 0 0
## PubDate.month.fctr PubDate.date.fctr PubDate.wkday.fctr
## 0 0 0
## PubDate.wkend PubDate.hour.fctr PubDate.minute.fctr
## 0 0 0
## PubDate.second.fctr PubDate.zoo PubDate.last1
## 0 0 0
## PubDate.last1.log PubDate.last10 PubDate.last10.log
## 0 0 0
## PubDate.last100 PubDate.last100.log WordCount.log
## 0 0 0
## .rnorm
## 0
## [1] "string data missing in : "
## NewsDesk SectionName SubsectionName Headline Snippet
## 2407 2883 6156 0 0
## Abstract PubDate myCategory
## 0 0 0
# Log the start of the "encode.data" step (minor step, major.inc=FALSE) in glb_chunks_df.
glb_chunks_df <- myadd_chunk(glb_chunks_df, "encode.data", major.inc=FALSE)
## label step_major step_minor bgn end elapsed
## 4 manage.missing.data 2 2 36.983 42.439 5.456
## 5 encode.data 2 3 42.440 NA NA
2.3: encode data
# map_<col_name>_df <- myimport_data(
# url="<map_url>",
# comment="map_<col_name>_df", print_diagn=TRUE)
# map_<col_name>_df <- read.csv(paste0(getwd(), "/data/<file_name>.csv"), strip.white=TRUE)
# glb_trnobs_df <- mymap_codes(glb_trnobs_df, "<from_col_name>", "<to_col_name>",
# map_<to_col_name>_df, map_join_col_name="<map_join_col_name>",
# map_tgt_col_name="<to_col_name>")
# glb_newobs_df <- mymap_codes(glb_newobs_df, "<from_col_name>", "<to_col_name>",
# map_<to_col_name>_df, map_join_col_name="<map_join_col_name>",
# map_tgt_col_name="<to_col_name>")
# glb_trnobs_df$<col_name>.fctr <- factor(glb_trnobs_df$<col_name>,
# as.factor(union(glb_trnobs_df$<col_name>, glb_newobs_df$<col_name>)))
# glb_newobs_df$<col_name>.fctr <- factor(glb_newobs_df$<col_name>,
# as.factor(union(glb_trnobs_df$<col_name>, glb_newobs_df$<col_name>)))
# Log the start of the "extract.features" step as a new major step (major.inc=TRUE).
glb_chunks_df <- myadd_chunk(glb_chunks_df, "extract.features", major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 5 encode.data 2 3 42.440 42.468 0.028
## 6 extract.features 3 0 42.469 NA NA
3.0: extract features
#```{r extract_features, cache=FALSE, eval=glb_is_textual}
# Start a separate chunk log for the sub-steps of feature extraction
# (NULL as first argument: no existing log to append to).
extract.features_chunk_df <- myadd_chunk(NULL, "extract.features_bgn")
## label step_major step_minor bgn end elapsed
## 1 extract.features_bgn 1 0 42.515 NA NA
# Create new features that help prediction
# <col_name>.lag.2 <- lag(zoo(glb_trnobs_df$<col_name>), -2, na.pad=TRUE)
# glb_trnobs_df[, "<col_name>.lag.2"] <- coredata(<col_name>.lag.2)
# <col_name>.lag.2 <- lag(zoo(glb_newobs_df$<col_name>), -2, na.pad=TRUE)
# glb_newobs_df[, "<col_name>.lag.2"] <- coredata(<col_name>.lag.2)
#
# glb_newobs_df[1, "<col_name>.lag.2"] <- glb_trnobs_df[nrow(glb_trnobs_df) - 1,
# "<col_name>"]
# glb_newobs_df[2, "<col_name>.lag.2"] <- glb_trnobs_df[nrow(glb_trnobs_df),
# "<col_name>"]
# glb_allobs_df <- mutate(glb_allobs_df,
# A.P.http=ifelse(grepl("http",Added,fixed=TRUE), 1, 0)
# )
#
# glb_trnobs_df <- mutate(glb_trnobs_df,
# )
#
# glb_newobs_df <- mutate(glb_newobs_df,
# )
# Create factors of string variables
# Log the start of the "extract.features_factorize.str.vars" sub-step
# in the feature-extraction chunk log.
extract.features_chunk_df <- myadd_chunk(extract.features_chunk_df,
paste0("extract.features_", "factorize.str.vars"), major.inc=TRUE)
## label step_major step_minor bgn end
## 1 extract.features_bgn 1 0 42.515 42.524
## 2 extract.features_factorize.str.vars 2 0 42.525 NA
## elapsed
## 1 0.009
## 2 NA
# Names of the character columns of glb_allobs_df (see printed result below)
str_vars <- myfind_chr_cols_df(glb_allobs_df)
print(str_vars)
## NewsDesk SectionName SubsectionName Headline
## "NewsDesk" "SectionName" "SubsectionName" "Headline"
## Snippet Abstract PubDate .src
## "Snippet" "Abstract" "PubDate" ".src"
## myCategory
## "myCategory"
# Turn every string variable that is still a candidate feature into a factor
# column named "<var>.fctr", then exclude the raw string column from the features.
str_vars <- setdiff(str_vars, glb_exclude_vars_as_features)
if (length(str_vars) > 0) {
    for (var in str_vars) {
        warning("Creating factors of string variable: ", var,
                ": # of unique values: ", length(unique(glb_allobs_df[, var])))
        # Levels are the unique values in order of first appearance
        glb_allobs_df[, paste0(var, ".fctr")] <-
            factor(glb_allobs_df[, var],
                   levels=as.factor(unique(glb_allobs_df[, var])))
        # (the equivalent per-dataset assignments for glb_trnobs_df / glb_newobs_df
        #  are disabled; only glb_allobs_df is maintained here)
    }
    glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features, str_vars)
}
## Warning: Creating factors of string variable: myCategory: # of unique
## values: 20
if (glb_is_textual) {
# Attach the packages used for text processing, in this order.
invisible(lapply(c("foreach", "gsubfn", "stringr", "tm"),
                 require, character.only=TRUE))
# Log the start of the "extract.features_process.text" sub-step.
extract.features_chunk_df <-
    myadd_chunk(extract.features_chunk_df,
                paste0("extract.features_", "process.text"),
                major.inc=TRUE)
# Frequency table of every substring of txt_vctr that matches re_str.
#   re_str      : regular expression to extract
#   ignore.case : passed to stringr::regex(ignore_case=)
# NOTE: txt_vctr is not a parameter; it is read from the enclosing scope, so the
# caller must assign it before calling.
# Returns mycreate_sqlxtab_df() applied to a one-column ("pattern") data frame.
chk_pattern_freq <- function(re_str, ignore.case=TRUE) {
    hits <- str_extract_all(txt_vctr,
                            regex(re_str, ignore_case=ignore.case),
                            simplify=TRUE)
    # simplify=TRUE pads with ""; keep the real matches only
    hits_df <- as.data.frame(hits[hits != ""])
    colnames(hits_df) <- "pattern"
    mycreate_sqlxtab_df(hits_df, "pattern")
}
#tmp_freq_df <- chk_pattern_freq("\\bNew (\\w)+", ignore.case=FALSE)
#subset(chk_pattern_freq("\\bNew (\\w)+", ignore.case=FALSE), grepl("New [[:upper:]]", pattern))
#chk_pattern_freq("\\bnew (\\W)+")
# Debug helper: show the effect of one gsubfn substitution on a small sample.
#   pos_ix : index into gsubfn_args_lst[["re_str"]] / gsubfn_args_lst[["rp_frmla"]]
# Reads gsubfn_args_lst and txt_vctr from the enclosing scope.
# Prints the pattern, the replacement formula, up to 5 matching entries of
# txt_vctr and their substituted versions; returns the substituted sample.
chk_subfn <- function(pos_ix) {
    re_str <- gsubfn_args_lst[["re_str"]][[pos_ix]]
    print("re_str:"); print(re_str)
    rp_frmla <- gsubfn_args_lst[["rp_frmla"]][[pos_ix]]
    print("rp_frmla:"); print(rp_frmla, showEnv=FALSE)
    # head() instead of [1:5]: indexing past the end pads the sample with NA
    # whenever fewer than 5 entries match.
    tmp_vctr <- head(grep(re_str, txt_vctr, value=TRUE, ignore.case=TRUE), 5)
    print("Before:")
    print(tmp_vctr)
    print("After:")
    print(gsubfn(re_str, rp_frmla, tmp_vctr, ignore.case=TRUE))
}
#chk_subfn(1)
# Apply every substitution in gsub_map_lst (names = patterns, values =
# replacements) to txt_vctr, in list order, and return the result.
#   ... : extra arguments forwarded to gsub()
# Both gsub_map_lst and txt_vctr are read from the enclosing scope; the caller's
# txt_vctr is not modified. Returns txt_vctr unchanged when the map has no names.
myapply_gsub <- function(...) {
    ptn_vctr <- names(gsub_map_lst)
    n_ptn <- length(ptn_vctr)
    if (n_ptn == 0) return(txt_vctr)
    out_vctr <- txt_vctr
    for (ix in seq_len(n_ptn)) {
        print(sprintf("running gsub for %02d (of %02d): #%s#...", ix,
                      n_ptn, ptn_vctr[ix]))
        out_vctr <- gsub(ptn_vctr[ix], gsub_map_lst[[ix]], out_vctr, ...)
    }
    out_vctr
}
# Apply every row of the text map glb_txt_map_df to txt_vctr, in row order.
#   txt_vctr : character vector to clean
#   ...      : extra arguments forwarded to gsub() (e.g. ignore.case=FALSE)
# glb_txt_map_df is read from the enclosing scope; column "rex_str" holds the
# pattern and column "rpl_str" the replacement.
# Returns the transformed vector; unchanged when the map has no rows.
myapply_txtmap <- function(txt_vctr, ...) {
    nrows <- nrow(glb_txt_map_df)
    # seq_len() instead of 1:nrows: with an empty map 1:0 would iterate over
    # c(1, 0) and corrupt txt_vctr with NA before failing.
    for (ptn_ix in seq_len(nrows)) {
        print(sprintf("running gsub for %02d (of %02d): #%s#...", ptn_ix,
                      nrows, glb_txt_map_df[ptn_ix, "rex_str"]))
        txt_vctr <- gsub(glb_txt_map_df[ptn_ix, "rex_str"],
                         glb_txt_map_df[ptn_ix, "rpl_str"],
                         txt_vctr, ...)
    }
    return(txt_vctr)
}
# Debug helper: print all.equal() of the saved vs. current cleaned Headline text
# for positions bgn..end (sav_txt_lst / glb_txt_lst are read from the enclosing scope).
chk.equal <- function(bgn, end) {
    obs_ix <- bgn:end
    print(all.equal(sav_txt_lst[["Headline"]][obs_ix],
                    glb_txt_lst[["Headline"]][obs_ix]))
}
# Debug helper: print the saved, then the current, cleaned Headline text for
# positions bgn..end so that differences can be eyeballed.
dsp.equal <- function(bgn, end) {
    obs_ix <- bgn:end
    print(sav_txt_lst[["Headline"]][obs_ix])
    print(glb_txt_lst[["Headline"]][obs_ix])
}
#sav_txt_lst <- glb_txt_lst; all.equal(sav_txt_lst, glb_txt_lst)
#all.equal(sav_txt_lst[["Headline"]][1:4200], glb_txt_lst[["Headline"]][1:4200])
#all.equal(sav_txt_lst[["Headline"]][1:2000], glb_txt_lst[["Headline"]][1:2000])
#all.equal(sav_txt_lst[["Headline"]][1:1000], glb_txt_lst[["Headline"]][1:1000])
#all.equal(sav_txt_lst[["Headline"]][1:500], glb_txt_lst[["Headline"]][1:500])
#all.equal(sav_txt_lst[["Headline"]][1:200], glb_txt_lst[["Headline"]][1:200])
#all.equal(sav_txt_lst[["Headline"]][1:100], glb_txt_lst[["Headline"]][1:100])
#chk.equal( 1, 100)
#chk.equal(51, 100)
#chk.equal(81, 100)
#chk.equal(81, 90)
#chk.equal(81, 85)
#chk.equal(86, 90)
#chk.equal(96, 100)
#dsp.equal(86, 90)
# Load the text-cleaning map (lines starting with "#" in the CSV are comments)
glb_txt_map_df <- read.csv("mytxt_map.csv", comment.char="#", strip.white=TRUE)
glb_txt_lst <- list()
print(sprintf("Building glb_txt_lst..."))
# One cleaned character vector per text variable, built via foreach/%dopar%;
# the value of the loop body is the cleaned vector.
glb_txt_lst <- foreach(txt_var=glb_txt_vars) %dopar% {
    # for (txt_var in glb_txt_vars) {
    # myapply_txtmap shd be created as a tm_map::content_transformer ?
    # Ad-hoc checks for a single map entry:
    #print(glb_txt_map_df)
    #txt_var=glb_txt_vars[3]; txt_vctr <- glb_txt_lst[[txt_var]]
    #print(rex_str <- glb_txt_map_df[glb_txt_map_df$rex_str == "\\bWall St\\.", "rex_str"])
    #print(rex_str <- glb_txt_map_df[grepl("du Pont", glb_txt_map_df$rex_str), "rex_str"])
    #print(tmp_vctr <- grep(rex_str, txt_vctr, value=TRUE, ignore.case=FALSE))
    #ret_lst <- regexec(rex_str, txt_vctr, ignore.case=FALSE); ret_lst <- regmatches(txt_vctr, ret_lst); ret_vctr <- sapply(1:length(ret_lst), function(pos_ix) ifelse(length(ret_lst[[pos_ix]]) > 0, ret_lst[[pos_ix]], "")); print(ret_vctr <- ret_vctr[ret_vctr != ""])
    #gsub(rex_str, glb_txt_map_df[glb_txt_map_df$rex_str == rex_str, "rpl_str"], tmp_vctr, ignore.case=FALSE)
    #grep("Hong Hong", txt_vctr, value=TRUE)
    myapply_txtmap(glb_allobs_df[, txt_var], ignore.case=FALSE)
}
names(glb_txt_lst) <- glb_txt_vars
# For each text variable, print the cleaned entries that still contain an
# upper-case letter immediately followed by a period (candidate acronyms /
# abbreviations left over after the text-map substitutions).
for (txt_var in glb_txt_vars) {
print(sprintf("Remaining Acronyms in %s:", txt_var))
txt_vctr <- glb_txt_lst[[txt_var]]
# Case-sensitive match; the matching entries are also kept in tmp_vctr
print(tmp_vctr <- grep("[[:upper:]]\\.", txt_vctr, value=TRUE, ignore.case=FALSE))
}
# For each text variable, print a frequency table of remaining two-word terms that
# start with a place-name prefix (Fort, Ft., Hong, Las, Los, New, Puerto, Saint,
# San, St.) and whose second word starts with an upper-case letter.
for (txt_var in glb_txt_vars) {
re_str <- "\\b(Fort|Ft\\.|Hong|Las|Los|New|Puerto|Saint|San|St\\.)( |-)(\\w)+"
print(sprintf("Remaining #%s# terms in %s: ", re_str, txt_var))
# chk_pattern_freq() takes no text argument: it reads txt_vctr from the enclosing
# scope, so this assignment must precede the call.
txt_vctr <- glb_txt_lst[[txt_var]]
# Sorted by descending count, then pattern
print(orderBy(~ -.n +pattern, subset(chk_pattern_freq(re_str, ignore.case=FALSE),
grepl("( |-)[[:upper:]]", pattern))))
print(" consider cleaning if relevant to problem domain; geography name; .n > 1")
#grep("New G", txt_vctr, value=TRUE, ignore.case=FALSE)
#grep("St\\. Wins", txt_vctr, value=TRUE, ignore.case=FALSE)
}
# For each text variable, print a frequency table of remaining terms made of a
# single letter N/S/E/W/C followed by a space or period and a word.
for (txt_var in glb_txt_vars) {
re_str <- "\\b(N|S|E|W|C)( |\\.)(\\w)+"
print(sprintf("Remaining #%s# terms in %s: ", re_str, txt_var))
# chk_pattern_freq() reads txt_vctr from the enclosing scope
txt_vctr <- glb_txt_lst[[txt_var]]
# NOTE(review): grepl(".", pattern) matches any non-empty string, so this subset()
# keeps every row -- confirm whether a literal period ("\\.") was intended.
print(orderBy(~ -.n +pattern, subset(chk_pattern_freq(re_str, ignore.case=FALSE),
grepl(".", pattern))))
#grep("N Weaver", txt_vctr, value=TRUE, ignore.case=FALSE)
}
# For each text variable, print a frequency table of remaining terms that start
# with North/South/East/West/Central followed by a space or period and a word.
for (txt_var in glb_txt_vars) {
re_str <- "\\b(North|South|East|West|Central)( |\\.)(\\w)+"
print(sprintf("Remaining #%s# terms in %s: ", re_str, txt_var))
# chk_pattern_freq() reads txt_vctr from the enclosing scope
txt_vctr <- glb_txt_lst[[txt_var]]
# NOTE(review): grepl(".", pattern) matches any non-empty string, so this subset()
# keeps every row -- confirm whether a literal period ("\\.") was intended.
print(orderBy(~ -.n +pattern, subset(chk_pattern_freq(re_str, ignore.case=FALSE),
grepl(".", pattern))))
#grep("Central (African|Bankers|Cast|Italy|Role|Spring)", txt_vctr, value=TRUE, ignore.case=FALSE)
#grep("East (Africa|Berlin|London|Poland|Rivals|Spring)", txt_vctr, value=TRUE, ignore.case=FALSE)
#grep("North (American|Korean|West)", txt_vctr, value=TRUE, ignore.case=FALSE)
#grep("South (Pacific|Street)", txt_vctr, value=TRUE, ignore.case=FALSE)
#grep("St\\. Martins", txt_vctr, value=TRUE, ignore.case=FALSE)
}
# Report the hyphenated (compound) terms remaining in a text vector.
#   txt_vctr : character vector with one entry per row of glb_allobs_df
#              (the row names of glb_allobs_df are used to label the term matrix)
# Steps: build a tm corpus, lower-case it, strip punctuation while preserving
# intra-word dashes, compute total term frequencies, keep the terms containing
# "-", and drop those matching any "rex_str" regex listed in mytxt_compound.csv.
# Prints the term-matrix summary, the number of remaining compound terms and the
# terms themselves; returns the value of myprint_df().
find_cmpnd_wrds <- function(txt_vctr) {
    txt_corpus <- Corpus(VectorSource(txt_vctr))
    txt_corpus <- tm_map(txt_corpus, tolower)
    txt_corpus <- tm_map(txt_corpus, PlainTextDocument)
    txt_corpus <- tm_map(txt_corpus, removePunctuation,
                         preserve_intra_word_dashes=TRUE)
    full_Tf_DTM <- DocumentTermMatrix(txt_corpus,
                                      control=list(weighting=weightTf))
    print(" Full TermMatrix:"); print(full_Tf_DTM)
    full_Tf_mtrx <- as.matrix(full_Tf_DTM)
    rownames(full_Tf_mtrx) <- rownames(glb_allobs_df) # print unreadable otherwise
    full_Tf_vctr <- colSums(full_Tf_mtrx)
    names(full_Tf_vctr) <- dimnames(full_Tf_DTM)[[2]]
    #grep("year", names(full_Tf_vctr), value=TRUE)
    #which.max(full_Tf_mtrx[, "yearlong"])
    full_Tf_df <- as.data.frame(full_Tf_vctr)
    names(full_Tf_df) <- "Tf.full"
    full_Tf_df$term <- rownames(full_Tf_df)
    #full_Tf_df$freq.full <- colSums(full_Tf_mtrx != 0)
    full_Tf_df <- orderBy(~ -Tf.full, full_Tf_df)
    # Row names are the terms themselves, so the matching terms can index rows
    cmpnd_Tf_df <- full_Tf_df[grep("-", full_Tf_df$term, value=TRUE) ,]
    filter_df <- read.csv("mytxt_compound.csv", comment.char="#", strip.white=TRUE)
    cmpnd_Tf_df$filter <- FALSE
    # seq_len() instead of 1:nrow(): with an empty filter file 1:0 would iterate
    # over c(1, 0), mark every term NA (dropping all of them) and then fail.
    for (row_ix in seq_len(nrow(filter_df)))
        # Only terms not yet filtered are tested against the next regex
        cmpnd_Tf_df[!cmpnd_Tf_df$filter, "filter"] <-
            grepl(filter_df[row_ix, "rex_str"],
                  cmpnd_Tf_df[!cmpnd_Tf_df$filter, "term"], ignore.case=TRUE)
    cmpnd_Tf_df <- subset(cmpnd_Tf_df, !filter)
    # Bug in tm_map(txt_corpus, removePunctuation, preserve_intra_word_dashes=TRUE) ???
    # "net-a-porter" gets converted to "net-aporter"
    #grep("net-a-porter", txt_vctr, ignore.case=TRUE, value=TRUE)
    #grep("maser-laser", txt_vctr, ignore.case=TRUE, value=TRUE)
    #txt_corpus[[which(grepl("net-a-porter", txt_vctr, ignore.case=TRUE))]]
    #grep("\\b(across|longer)-(\\w)", cmpnd_Tf_df$term, ignore.case=TRUE, value=TRUE)
    #grep("(\\w)-(affected|term)\\b", cmpnd_Tf_df$term, ignore.case=TRUE, value=TRUE)
    print(sprintf("nrow(cmpnd_Tf_df): %d", nrow(cmpnd_Tf_df)))
    myprint_df(cmpnd_Tf_df)
}
# For each text variable, print the compound (hyphenated) terms that
# find_cmpnd_wrds() still reports for it.
for (txt_var in glb_txt_vars) {
    txt_vctr <- glb_txt_lst[[txt_var]]
    print(sprintf("Remaining compound terms in %s: ", txt_var))
    find_cmpnd_wrds(txt_vctr)
    #grep("thirty-five", txt_vctr, ignore.case=TRUE, value=TRUE)
    #rex_str <- glb_txt_map_df[grepl("hirty", glb_txt_map_df$rex_str), "rex_str"]
}
extract.features_chunk_df <-
    myadd_chunk(extract.features_chunk_df,
                paste0("extract.features_", "build.corpus"), major.inc=TRUE)

# Build one pre-processed corpus per text variable. Iterations are dispatched
# through %dopar%; each one yields its finished corpus and foreach collects
# them into a list that is named after the text variables afterwards.
glb_corpus_lst <- list()
print(sprintf("Building glb_corpus_lst..."))
glb_corpus_lst <- foreach(txt_var=glb_txt_vars) %dopar% {
#   for (txt_var in glb_txt_vars) {
    txt_corpus <- Corpus(VectorSource(glb_txt_lst[[txt_var]]))
    txt_corpus <- tm_map(txt_corpus, tolower) #nuppr
    txt_corpus <- tm_map(txt_corpus, PlainTextDocument)
    txt_corpus <- tm_map(txt_corpus, removePunctuation) #npnct<chr_ix>
#   txt-corpus <- tm_map(txt_corpus, content_transformer(function(x, pattern) gsub(pattern, "", x))

    # Not to be run in production
    # Debug helper: prints and returns the term-frequency table of the corpus
    # as it stands at the point of the call.
    inspect_terms <- function() {
        Tf_DTM <- DocumentTermMatrix(txt_corpus,
                                     control=list(weighting=weightTf))
        print(" Full TermMatrix:"); print(Tf_DTM)
        Tf_mtrx <- as.matrix(Tf_DTM)
        rownames(Tf_mtrx) <- rownames(glb_allobs_df) # print unreadable otherwise
        Tf_vctr <- colSums(Tf_mtrx)
        names(Tf_vctr) <- dimnames(Tf_DTM)[[2]]
        #grep("year", names(full_Tf_vctr), value=TRUE)
        #which.max(full_Tf_mtrx[, "yearlong"])
        Tf_df <- as.data.frame(Tf_vctr)
        names(Tf_df) <- "Tf.full"
        Tf_df$term <- rownames(Tf_df)
        #full_Tf_df$freq.full <- colSums(full_Tf_mtrx != 0)
        Tf_df <- orderBy(~ -Tf.full +term, Tf_df)
        print(myplot_histogram(Tf_df, "Tf.full"))
        myprint_df(Tf_df)
        #txt_corpus[[which(grepl("zun", txt_vctr, ignore.case=TRUE))]]
        digit_terms_df <- subset(Tf_df, grepl("[[:digit:]]", term))
        myprint_df(digit_terms_df)
        return(Tf_df)
    }
    #print("RemovePunct:"); remove_punct_Tf_df <- inspect_terms()

    # Variable-specific stop words are removed together with the English list
    stop_wrds <- c(glb_append_stop_words[[txt_var]], stopwords("english"))
    txt_corpus <- tm_map(txt_corpus, removeWords, stop_wrds) #nstopwrds
    #print("StoppedWords:"); stopped_words_Tf_df <- inspect_terms()

    txt_corpus <- tm_map(txt_corpus, stemDocument) #???
    #print("StemmedWords:"); stemmed_words_Tf_df <- inspect_terms()
    #stemmed_stopped_Tf_df <- merge(stemmed_words_Tf_df, stopped_words_Tf_df, by="term", all=TRUE, suffixes=c(".stem", ".stop"))
    #myprint_df(stemmed_stopped_Tf_df)
    #print(subset(stemmed_stopped_Tf_df, grepl("compan", term)))
    #glb_corpus_lst[[txt_var]] <- txt_corpus

    # value of this iteration
    txt_corpus
}
names(glb_corpus_lst) <- glb_txt_vars
extract.features_chunk_df <-
    myadd_chunk(extract.features_chunk_df,
                paste0("extract.features_", "extract.DTM"), major.inc=TRUE)

# Per text variable, keep two TfIdf-weighted document-term matrices: the full
# one and a copy passed through removeSparseTerms() with that variable's entry
# of glb_sprs_thresholds.
glb_full_DTM_lst <- list()
glb_sprs_DTM_lst <- list()
for (txt_var in glb_txt_vars) {
    print(sprintf("Extracting TfIDf terms for %s...", txt_var))
    txt_corpus <- glb_corpus_lst[[txt_var]]

#   full_Tf_DTM <- DocumentTermMatrix(txt_corpus,
#                                     control=list(weighting=weightTf))
    full_TfIdf_DTM <-
        DocumentTermMatrix(txt_corpus, control=list(weighting=weightTfIdf))
    sprs_TfIdf_DTM <-
        removeSparseTerms(full_TfIdf_DTM, glb_sprs_thresholds[txt_var])

#   glb_full_DTM_lst[[txt_var]] <- full_Tf_DTM
#   glb_sprs_DTM_lst[[txt_var]] <- sprs_Tf_DTM
    glb_full_DTM_lst[[txt_var]] <- full_TfIdf_DTM
    glb_sprs_DTM_lst[[txt_var]] <- sprs_TfIdf_DTM
}
extract.features_chunk_df <- myadd_chunk(extract.features_chunk_df,
paste0("extract.features_", "report.DTM"), major.inc=TRUE)
# For each text variable, compare the full and the sparse TfIdf term matrices:
# print both, build one row per term with its summed TfIdf weight and the
# number of documents it occurs in, then plot how the terms kept in the sparse
# matrix relate to the ones that were dropped.
for (txt_var in glb_txt_vars) {
print(sprintf("Reporting TfIDf terms for %s...", txt_var))
full_TfIdf_DTM <- glb_full_DTM_lst[[txt_var]]
sprs_TfIdf_DTM <- glb_sprs_DTM_lst[[txt_var]]
print(" Full TermMatrix:"); print(full_TfIdf_DTM)
full_TfIdf_mtrx <- as.matrix(full_TfIdf_DTM)
rownames(full_TfIdf_mtrx) <- rownames(glb_allobs_df) # print unreadable otherwise
# Per-term totals over all documents for the full matrix
full_TfIdf_vctr <- colSums(full_TfIdf_mtrx)
names(full_TfIdf_vctr) <- dimnames(full_TfIdf_DTM)[[2]]
#grep("scene", names(full_TfIdf_vctr), value=TRUE)
#which.max(full_TfIdf_mtrx[, "yearlong"])
full_TfIdf_df <- as.data.frame(full_TfIdf_vctr)
names(full_TfIdf_df) <- "TfIdf.full"
full_TfIdf_df$term <- rownames(full_TfIdf_df)
# freq.full: number of documents in which the term has a non-zero weight
full_TfIdf_df$freq.full <- colSums(full_TfIdf_mtrx != 0)
full_TfIdf_df <- orderBy(~ -TfIdf.full, full_TfIdf_df)
# Same per-term summary for the sparse matrix
print(" Sparse TermMatrix:"); print(sprs_TfIdf_DTM)
sprs_TfIdf_vctr <- colSums(as.matrix(sprs_TfIdf_DTM))
names(sprs_TfIdf_vctr) <- dimnames(sprs_TfIdf_DTM)[[2]]
sprs_TfIdf_df <- as.data.frame(sprs_TfIdf_vctr)
names(sprs_TfIdf_df) <- "TfIdf.sprs"
sprs_TfIdf_df$term <- rownames(sprs_TfIdf_df)
sprs_TfIdf_df$freq.sprs <- colSums(as.matrix(sprs_TfIdf_DTM) != 0)
sprs_TfIdf_df <- orderBy(~ -TfIdf.sprs, sprs_TfIdf_df)
# all.x=TRUE keeps every term of the full matrix; terms missing from the sparse
# matrix get NA in the *.sprs columns, which is what in.sprs is derived from
terms_TfIdf_df <- merge(full_TfIdf_df, sprs_TfIdf_df, all.x=TRUE)
terms_TfIdf_df$in.sprs <- !is.na(terms_TfIdf_df$freq.sprs)
# Plot only terms whose total TfIdf reaches the smallest total among the terms
# that made it into the sparse matrix
plt_TfIdf_df <- subset(terms_TfIdf_df,
TfIdf.full >= min(terms_TfIdf_df$TfIdf.sprs, na.rm=TRUE))
# Label the plotted terms that are absent from the sparse matrix ...
plt_TfIdf_df$label <- ""
plt_TfIdf_df[is.na(plt_TfIdf_df$TfIdf.sprs), "label"] <-
plt_TfIdf_df[is.na(plt_TfIdf_df$TfIdf.sprs), "term"]
# ... and add them to this variable's entry of glb_important_terms
glb_important_terms[[txt_var]] <- union(glb_important_terms[[txt_var]],
plt_TfIdf_df[is.na(plt_TfIdf_df$TfIdf.sprs), "term"])
print(myplot_scatter(plt_TfIdf_df, "freq.full", "TfIdf.full",
colorcol_name="in.sprs") +
geom_text(aes(label=label), color="Black", size=3.5))
# Empirical CDF of every melted measure, with a dotted horizontal line at this
# variable's sparsity threshold
melt_TfIdf_df <- orderBy(~ -value, melt(terms_TfIdf_df, id.var="term"))
print(ggplot(melt_TfIdf_df, aes(value, color=variable)) + stat_ecdf() +
geom_hline(yintercept=glb_sprs_thresholds[txt_var],
linetype = "dotted"))
# Bar chart of the terms present in the sparse matrix
melt_TfIdf_df <- orderBy(~ -value,
melt(subset(terms_TfIdf_df, !is.na(TfIdf.sprs)), id.var="term"))
print(myplot_hbar(melt_TfIdf_df, "term", "value",
colorcol_name="variable"))
# Bar chart of the first 10 melted rows (ordered by descending value) for the
# terms absent from the sparse matrix
melt_TfIdf_df <- orderBy(~ -value,
melt(subset(terms_TfIdf_df, is.na(TfIdf.sprs)), id.var="term"))
print(myplot_hbar(head(melt_TfIdf_df, 10), "term", "value",
colorcol_name="variable"))
}
# Commented-out checks comparing the current corpus / DTM lists with
# previously saved copies
# sav_full_DTM_lst <- glb_full_DTM_lst
# sav_sprs_DTM_lst <- glb_sprs_DTM_lst
# print(identical(sav_glb_corpus_lst, glb_corpus_lst))
# print(all.equal(length(sav_glb_corpus_lst), length(glb_corpus_lst)))
# print(all.equal(names(sav_glb_corpus_lst), names(glb_corpus_lst)))
# print(all.equal(sav_glb_corpus_lst[["Headline"]], glb_corpus_lst[["Headline"]]))
# print(identical(sav_full_DTM_lst, glb_full_DTM_lst))
# print(identical(sav_sprs_DTM_lst, glb_sprs_DTM_lst))
# Drop the reporting temporaries left behind by the last loop iteration
rm(full_TfIdf_mtrx, full_TfIdf_df, melt_TfIdf_df, terms_TfIdf_df)
# Create txt features
# Every text variable is abbreviated to its upper-cased first letter when the
# derived feature columns are named, so two variables sharing a first letter
# would produce clashing column names. Stop early in that case and list the
# prefixes separated by commas (they used to be run together into one word).
pfxs <- vapply(glb_txt_vars,
               function(txt) toupper(substr(txt, 1, 1)),
               character(1))
if (anyDuplicated(pfxs) > 0)
    stop("Prefixes for corpus freq terms not unique: ",
         paste(pfxs, collapse=", "))
extract.features_chunk_df <-
    myadd_chunk(extract.features_chunk_df,
                paste0("extract.features_", "bind.DTM"), major.inc=TRUE)

# Append the sparse-matrix terms of every text variable to glb_allobs_df as
# columns named <first letter of the variable>.T.<term>.
for (txt_var in glb_txt_vars) {
    print(sprintf("Binding DTM for %s...", txt_var))
    txt_var_pfx <- toupper(substr(txt_var, 1, 1))

    txt_X_df <- as.data.frame(as.matrix(glb_sprs_DTM_lst[[txt_var]]))
    colnames(txt_X_df) <- paste0(txt_var_pfx, ".T.",
                                 make.names(colnames(txt_X_df)))
    rownames(txt_X_df) <- rownames(glb_allobs_df) # warning otherwise

#   plt_X_df <- cbind(txt_X_df, glb_allobs_df[, c(glb_id_vars, glb_rsp_var)])
#   print(myplot_box(df=plt_X_df, ycol_names="H.T.today", xcol_name=glb_rsp_var))
#   log_X_df <- log(1 + txt_X_df)
#   colnames(log_X_df) <- paste(colnames(txt_X_df), ".log", sep="")
#   plt_X_df <- cbind(log_X_df, glb_allobs_df[, c(glb_id_vars, glb_rsp_var)])
#   print(myplot_box(df=plt_X_df, ycol_names="H.T.today.log", xcol_name=glb_rsp_var))

    glb_allobs_df <- cbind(glb_allobs_df, txt_X_df) # TfIdf is normalized
    #glb_allobs_df <- cbind(glb_allobs_df, log_X_df) # if using non-normalized metrics
}
#identical(chk_entity_df, glb_allobs_df)
#chk_entity_df <- glb_allobs_df
# Register the "bind.DXM" step in the chunk log as a new major step
# (myadd_chunk is defined outside this section).
extract.features_chunk_df <-
    myadd_chunk(extract.features_chunk_df,
                paste0("extract.features_", "bind.DXM"), major.inc=TRUE)
#stop("here"); sav_allobs_df <- glb_allobs_df
# Regular-expression patterns used to count punctuation characters in the text
# variables. The position of a pattern in this vector becomes the two-digit
# index in the name of the resulting <pfx>.npnct<NN>.log feature, so the order
# must stay stable.
# Characters that are regex metacharacters are escaped. That includes the
# caret: an unescaped "^" is the start-of-string anchor and matches once in
# every string instead of matching literal "^" characters.
glb_punct_vctr <- c("!", "\"", "#", "\\$", "%", "&", "'",
                    "\\(|\\)",# "\\(", "\\)",
                    "\\*", "\\+", ",", "-", "\\.", "/", ":", ";",
                    "<|>", # "<",
                    "=",
                    # ">",
                    "\\?", "@", "\\[", "\\\\", "\\]", "\\^", "_", "`",
                    "\\{", "\\|", "\\}", "~")
# Derive per-document text statistics and pattern-count features for every
# text variable and append them to glb_allobs_df.
#
# Each foreach iteration handles one text variable and yields a data frame
# holding only the new feature columns, all prefixed with the upper-cased
# first letter of the variable name; the per-variable frames are combined
# column-wise.
txt_X_df <- foreach(txt_var=glb_txt_vars, .combine=cbind) %dopar% {
#for (txt_var in glb_txt_vars) {
    print(sprintf("Binding DXM for %s...", txt_var))
    txt_var_pfx <- toupper(substr(txt_var, 1, 1))

    # Scratch frame with the right rows. It is created inside the iteration so
    # that every iteration starts from the same columns, however the parallel
    # backend schedules or reuses its evaluation environment.
    txt_X_df <- glb_allobs_df[, c(glb_id_vars, ".rnorm"), FALSE]
    seed_cols <- names(txt_X_df)

    raw_txt_vctr <- glb_allobs_df[, txt_var]   # text as stored in glb_allobs_df
    map_txt_vctr <- glb_txt_lst[[txt_var]]     # text as stored in glb_txt_lst

    # Small local helpers to cut the repetition below
    feat_name <- function(sfx) paste0(txt_var_pfx, sfx)
    log_occ <- function(rex_str, txt_vctr)
        log(1 + mycount_pattern_occ(rex_str, txt_vctr))
    int_occ <- function(rex_str, ...)
        as.integer(0 + mycount_pattern_occ(rex_str, raw_txt_vctr, ...))

    txt_full_DTM_mtrx <- as.matrix(glb_full_DTM_lst[[txt_var]])
    rownames(txt_full_DTM_mtrx) <- rownames(glb_allobs_df) # print unreadable otherwise
    #print(txt_full_DTM_mtrx[txt_full_DTM_mtrx[, "ebola"] != 0, "ebola"])

    # Create <txt_var>.T.<term> for glb_important_terms
    for (term in glb_important_terms[[txt_var]])
        txt_X_df[, feat_name(paste0(".T.", make.names(term)))] <-
            txt_full_DTM_mtrx[, term]

    # Create <txt_var>.nwrds.log & .nwrds.unq.log
    txt_X_df[, feat_name(".nwrds.log")] <- log_occ("\\w+", map_txt_vctr)
    txt_X_df[, feat_name(".nwrds.unq.log")] <-
        log(1 + rowSums(txt_full_DTM_mtrx != 0))
    txt_X_df[, feat_name(".sum.TfIdf")] <- rowSums(txt_full_DTM_mtrx)
    # NOTE(review): the denominator recovers the raw word count, so documents
    # with no words yield NaN/Inf here, as before - confirm how downstream
    # code treats them.
    txt_X_df[, feat_name(".ratio.sum.TfIdf.nwrds")] <-
        txt_X_df[, feat_name(".sum.TfIdf")] /
        (exp(txt_X_df[, feat_name(".nwrds.log")]) - 1)

    # Create <txt_var>.nchrs.log
    txt_X_df[, feat_name(".nchrs.log")] <- log_occ(".", raw_txt_vctr)
    txt_X_df[, feat_name(".nuppr.log")] <- log_occ("[[:upper:]]", raw_txt_vctr)
    txt_X_df[, feat_name(".ndgts.log")] <- log_occ("[[:digit:]]", raw_txt_vctr)

    # Create <txt_var>.npnct?.log
    # would this be faster if it's iterated over each row instead of
    #   each created column ???
    for (punct_ix in seq_along(glb_punct_vctr)) {
#       smp0 <- " "
#       smp1 <- "! \" # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~"
#       smp2 <- paste(smp1, smp1, sep=" ")
#       print(sprintf("Testing %s pattern:", glb_punct_vctr[punct_ix]))
#       results <- mycount_pattern_occ(glb_punct_vctr[punct_ix], c(smp0, smp1, smp2))
#       names(results) <- NULL; print(results)
        txt_X_df[, feat_name(paste0(".npnct", sprintf("%02d", punct_ix), ".log"))] <-
            log_occ(glb_punct_vctr[punct_ix], raw_txt_vctr)
    }
#   print(head(glb_allobs_df[glb_allobs_df[, "A.npnct23.log"] > 0,
#                            c("UniqueID", "Popular", "Abstract", "A.npnct23.log")]))

    # Create <txt_var>.nstopwrds.log & <txt_var>ratio.nstopwrds.nwrds
    stop_words_rex_str <-
        paste0("\\b(",
               paste0(c(glb_append_stop_words[[txt_var]], stopwords("english")),
                      collapse="|"),
               ")\\b")
    txt_X_df[, feat_name(".nstopwrds.log")] <-
        log_occ(stop_words_rex_str, map_txt_vctr)
    # exp of the difference of the two logs, i.e. (1 + nstopwrds) / (1 + nwrds)
    txt_X_df[, feat_name(".ratio.nstopwrds.nwrds")] <-
        exp(txt_X_df[, feat_name(".nstopwrds.log")] -
            txt_X_df[, feat_name(".nwrds.log")])

    # Create <txt_var>.P.http
    txt_X_df[, feat_name(".P.http")] <- int_occ("http")

    # Create user-specified pattern vectors
    #   <txt_var>.P.year.colon
    txt_X_df[, feat_name(".P.year.colon")] <- int_occ("[0-9]{4}:")
    txt_X_df[, feat_name(".P.daily.clip.report")] <- int_occ("Daily Clip Report")
    txt_X_df[, feat_name(".P.fashion.week")] <- int_occ("Fashion Week")
    txt_X_df[, feat_name(".P.first.draft")] <- int_occ("First Draft")

    #sum(mycount_pattern_occ("Metropolitan Diary:", glb_allobs_df$Abstract) > 0)
    if (txt_var %in% c("Snippet", "Abstract")) {
        txt_X_df[, feat_name(".P.metropolitan.diary.colon")] <-
            int_occ("Metropolitan Diary:")
    }

    #sum(mycount_pattern_occ("[0-9]{4}:", glb_allobs_df$Headline) > 0)
    #sum(mycount_pattern_occ("Quandary(.*)(?=:)", glb_allobs_df$Headline, perl=TRUE) > 0)
    #sum(mycount_pattern_occ("No Comment(.*):", glb_allobs_df$Headline) > 0)
    #sum(mycount_pattern_occ("Friday Night Music:", glb_allobs_df$Headline) > 0)
    if (txt_var %in% c("Headline")) {
        txt_X_df[, feat_name(".P.facts.figures")] <- int_occ("Facts & Figures:")
        txt_X_df[, feat_name(".P.friday.night.music")] <- int_occ("Friday Night Music")
        txt_X_df[, feat_name(".P.no.comment.colon")] <- int_occ("No Comment(.*):")
        txt_X_df[, feat_name(".P.on.this.day")] <- int_occ("On This Day")
        # the look-ahead needs perl=TRUE
        txt_X_df[, feat_name(".P.quandary")] <- int_occ("Quandary(.*)(?=:)", perl=TRUE)
        txt_X_df[, feat_name(".P.readers.respond")] <- int_occ("Readers Respond")
        txt_X_df[, feat_name(".P.recap.colon")] <- int_occ("Recap:")
        txt_X_df[, feat_name(".P.s.notebook")] <- int_occ("s Notebook")
        txt_X_df[, feat_name(".P.today.in.politic")] <- int_occ("Today in Politic")
        txt_X_df[, feat_name(".P.today.in.smallbusiness")] <-
            int_occ("Today in Small Business:")
        txt_X_df[, feat_name(".P.verbatim.colon")] <- int_occ("Verbatim:")
        txt_X_df[, feat_name(".P.what.we.are")] <- int_occ("What We're")
    }
    #summary(glb_allobs_df[ ,grep("P.on.this.day", names(glb_allobs_df), value=TRUE)])

    # Yield only the feature columns created above. The scratch columns are
    # removed by exact name, so several id variables are handled and a feature
    # whose name merely contains an id-variable name is not dropped.
    txt_X_df[, setdiff(names(txt_X_df), seed_cols), drop=FALSE]
    #glb_allobs_df <- cbind(glb_allobs_df, txt_X_df)
}
glb_allobs_df <- cbind(glb_allobs_df, txt_X_df)
#myplot_box(glb_allobs_df, "A.sum.TfIdf", glb_rsp_var)
# Generate summaries
# print(summary(glb_allobs_df))
# print(sapply(names(glb_allobs_df), function(col) sum(is.na(glb_allobs_df[, col]))))
# print(summary(glb_trnobs_df))
# print(sapply(names(glb_trnobs_df), function(col) sum(is.na(glb_trnobs_df[, col]))))
# print(summary(glb_newobs_df))
# print(sapply(names(glb_newobs_df), function(col) sum(is.na(glb_newobs_df[, col]))))
rm(log_X_df, txt_X_df)
}
## Loading required package: stringr
## Loading required package: tm
## Loading required package: NLP
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
## label step_major step_minor bgn end
## 2 extract.features_factorize.str.vars 2 0 42.525 42.855
## 3 extract.features_process.text 3 0 42.855 NA
## elapsed
## 2 0.33
## 3 NA
## [1] "Building glb_txt_lst..."
## [1] "Remaining Acronyms in Headline:"
## character(0)
## [1] "Remaining Acronyms in Snippet:"
## [1] "In the 1864 election, Gotham delivered nearly twice as many votes to the presidents opponent, George B. McClellan."
## [2] "Researchers are finding more evidence that women who take S.S.R.I. depressants like Prozac and Zoloft increase the likelihood of a variety of health problems in their newborns."
## [3] "Eric T. Schneiderman, NewYorks attorney general, filed a suit on Tuesday that accuses Evans Bank of denying mortgages to African-Americans in Buffalo regardless of their credit."
## [4] "Jeffrey H. Knox, a senior federal prosecutor who butted heads with a number of WallStreet banks, is switching sides."
## [5] "In which Peter A. Collins disproves Aesop."
## [6] "Antonia M. Apps, the lead federal prosecutor in the criminal trial last year of Michael Steinberg, is taking a job with the law firm Milbank, Tweed, Hadley & McCloy."
## [7] "Scholarly musicians prove that Ph.D.s can play."
## [8] "In this new series, Chris Labzda and Bon Duke, the co-founders of the NewYork Fashion Film Festival, curate a short film each week for T. This weeks installment: A collage of classics overlaid with sound from Lauren Wolksteins Social..."
## [9] "I moved to Hudson, N.Y., to slow down and save money. I didnt know Id feel so lonely."
## [10] "A study suggests that exercise can help kids, especially those with A.D.H.D., focus in class."
## [11] "How should the USA respond to the killings of Steven J. Sotloff and James Foley?"
## [12] "Although banks are hiring armies of legal and compliance professionals in response to settlements and fines, they would do better to find ways to streamline their existing legal work, Geoffrey A. Moore and Mark Harris write in an Another View column."
## [13] "William T. Shermans 1864 campaign to take the Georgia city was one of the bloodiest of the Civil War."
## [14] "George Martin, author of the Song of Ice and Fire series, will be promoting his new book at the 92nd St. Y."
## [15] "The workers are OK."
## [16] "The Securities and Exchange Commission has chosen Tracey L. McNeil, an agency veteran and former corporate lawyer, to act as a liaison in resolving problems that retail investors may have with the agency."
## [17] "John A. Paulsons hedge fund suffered across-the-board losses in July and did only marginally better in August, according to an update to investors."
## [18] "Mr. Sheen will direct and perform in Dylan Thomass work he called it a play for voices on Oct. 26 at the 92nd Street Y."
## [19] "The acquisition is the latest move by Jeffrey R. Immelt, the chief of GE, to refocus the conglomerate on its core industrial businesses."
## [20] "The Chinese state news media has cited concerns that the USA is indoctrinating Chinese students by including its founding documents upholding freedom and human rights in the SAT."
## [21] "At the midpoint of the range, the Citizens Financial Group, based in Providence, R.I., would be valued at $13.4 billion."
## [22] "Mark P. Frissoras decision comes a few weeks after Carl C. Icahn disclosed an 8.48 percent stake in the rental car provider."
## [23] "Martin Lipton is virtually guaranteeing the continuation of policies opposed by many NYU faculty members, William D. Cohan writes in the Street Scene column."
## [24] "China Central Television used images of Condoleezza Rice in a report about a Beijing visit by Susan E. Rice, President Obamas national security adviser."
## [25] "If youve never heard the name Lloyd J. Austin, theres a good reason"
## [26] "This weeks video features Ismenia Mendes and David McElwee in a scene from A. R. Gurneys 1977 drama The Wayside Motor Inn."
## [27] "The media and financial data company Thomson Reuters is looking to sell peHUB, Buyouts and Venture Capital Journal, according to a report by peHUB."
## [28] "News headlines briefly flashed that Carl C. Icahn had raised his stake in Gannett, the media company, to nearly 9 percent. But as it turns out, his firm simply made a typo in a regulatory filing."
## [29] "Just two days after her USA Open victory, Serena Williams hosted her first ever fashion show for HSN."
## [30] "Nearly 70 years after they parted, a Chinese veteran is on a quest to find a Japanese woman he met shortly after World War II. His tale offers a warmer recollection of that era."
## [31] "In this lesson we offer a series of topics and questions paired with Times essays, articles, slide shows and videos to help students dig deeper into the causes, effects and overall legacy of World War I."
## [32] "A collection of material related to Rosa Parks, bought last month by Howard G. Buffett, is being lent to the Library of Congress for 10 years."
## [33] "A soldier takes time out of uniform to pursue an M.F.A. in writing, and discovers that what sets him apart, as a veteran at the keyboard, is not as important as the common ground he shares with other writers."
## [34] "Worn by the chef from July 29 to Aug. 4 while cooking at his two restaurants, Blue Hill in Manhattan and Blue Hill at Stone Barns in Pocantico Hills, N.Y."
## [35] "Gov. Andrew M. Cuomo and Mayor Bill de Blasio both enjoyed an Italian sausage at the Feast of SanGennaro in Little Italy on Saturday."
## [36] "Eike Batista was accused of manipulating the share price of his now-bankrupt petroleum company OGX."
## [37] "The activist investor William A. Ackmans new fund, Pershing Square Holdings, aims to list on the the Euronext Amsterdam exchange on Oct. 13."
## [38] "Fraud in the market for penny stocks continues unabated, two criminal cases filed last week show, Peter J. Henning writes in the White Collar Watch column."
## [39] "Quarries and connecting trenches were nothing less than small cities, where the walls bear witness to the people who fought and died in World War I."
## [40] "The deal for Vkontakte ends tensions between the social networks minority shareholders and Alisher B. Usmanov, the Russian billionaire who controls Mail.ru."
## [41] "Sears is borrowing $400 million from its chief, the billionaire Edward S. Lampert, through his hedge fund."
## [42] "KKRs purchase of Pioneers D.J. audio equipment business is the modern-day equivalent of providing gold-rush pickaxes to participants in the rave rush, Jeffrey Goldfarb writes in Reuters Breakingviews."
## [43] "Two grandchildren of Dwight D. Eisenhower say proposed changes by Frank Gehry to his memorial to the former president do not satisfy their concerns."
## [44] "In a letter to Electra Private Equity shareholders, the activist investor Edward J. Bramson said a change in approach in Electras strategy could increase the value of its shares by about $1.6 billion."
## [45] "Prosecutors in Brazil have argued that Mr. Batista profited from insider information when he sold shares in his now bankrupt petroleum company, OGX."
## [46] "Comments by Marshall L. Miller, the No. 2 official in the Justice Departments criminal division, reflect the agencys renewed interest in charging individual bank employees rather than just the banks."
## [47] "The top job at the database giant will be shared by Mark V. Hurd, now co-president, and Safra Catz, who is co-president and chief financial officer."
## [48] "Alibaba is set to start trading on the NewYork Stock Exchange under the ticker symbol BABA. | Markets showed relief that the United Kingdom would not face a tumultuous breakup. | Lawrence J. Ellison announced his retirement as chief executive of..."
## [49] "Average waist circumference but not B.M.I. has increased significantly in the USA, a new study reports."
## [50] "The British private equity firm accused Edward J. Bramsons Sherborne Investors Management of making unverifiable statements and unsubstantiated claims in its effort to reshape Electra."
## [51] "Alibaba soared in its public market debut. | Siemens agreed to buy the Dresser-Rand Group. | EMC weighed a deal with H.P. | Public pension funds may be souring on hedge funds."
## [52] "Venezuela may represent the Latin American and Caribbean nations and sit right next to the USA."
## [53] "Speaker John A. Boehner seemed to blame lazy Americans for the stalled economy."
## [54] "It was unclear whether President Petro O. Poroshenko would travel to NewYork to deliver his address as scheduled on Thursday."
## [55] "As the reclusive genius Richard D. James releases his first album in 13 years, the graphic designer with whom he has frequently collaborated discusses their work together."
## [56] "Vice President Joseph R. Biden Jr. met Carina Castro, the daughter of Julin Castro, secretary of Housing and Urban Development, at a reception honoring Hispanic Heritage Month."
## [57] "Attorney General Eric H. Holder Jr. will announce Tuesday that the federal government will end its fiscal year next week with 4,800 fewer prisoners, the first time since 1980 that the inmate population has declined from year to year."
## [58] "President Obama could end up leaving a smaller imprint on the judiciary than either President Bill Clinton or President George W. Bush."
## [59] "Ever wonder why jewelry stores hide the price tags? How invoice factoring works for small businesses. And why German companies are on a buying spree in the USA."
## [60] "Speaker John A. Boehner urges the new Veterans Affairs secretary to look to the private sector in reviewing how the department delivers health care to veterans."
## [61] "On this day in 1952, Richard M. Nixon gave his famous Checkers speech."
## [62] "To kick off Paris Fashion Week, the chic native whose shirting-based collection is stocked at Barneys NewYork and Dover Street Market shares a list of her favorite places around town with T."
## [63] "Gov. Paul R. LePage of Maine wants to help the NFL address its domestic violence problem."
## [64] "Maurice R. Greenberg, the former chief of AIG, has now raised several million dollars from three WallStreet investors to help cover the cost of the case."
## [65] "Electra said that two influential shareholder advisory services opposed efforts of the activist investor Edward J. Bramson, through his Sherborne Investors Management, to shake up the board of the British private equity firm."
## [66] "Representative James A. Traficant was seriously injured after a tractor accident on his family farm Tuesday night, according to local media reports in Youngstown, Ohio."
## [67] "The role of former President Gerald R. Ford in the filing of a friend-of-the-court brief in a Michigan affirmative action case."
## [68] "The insider trading case against Michael A. Lucarelli, a former executive at Lippert/Heilshorn and Associates, was relatively minor. But he got attention for running barefoot from the court in August."
## [69] "Judge Richard M. Berman of Federal District Court in Manhattan, before sentencing conservative author Dinesh DSouza to probation for violating federal campaign finance laws."
## [70] "Speaker John A. Boehner, an avid golfer, is gearing up for the Ryder Cup, the intense competition between the USA and Europe that begins Friday in Scotland."
## [71] "Speaker John A. Boehner tells First Draft that the new Congress should debate military action in Syria."
## [72] "A new poll has Senator Mark R. Warner, the Democratic incumbent in Virginia, up by 9 points over his Republican opponent."
## [73] "Eric H. Holder Jr.s resignation as attorney general means that President Obama is losing one the longest-serving members of his cabinet and one of his closest confidants."
## [74] "The Department of Labor is putting money behind its push to expand paid family leave in the USA. The issue could be a factor in coming elections if enough voters ask about it."
## [75] "Members of the NewYork Police Department turned out in force on Thursday for the funeral of Officer Michael Williams in LaGrangeville, N.Y."
## [76] "How big a fight will President Obama put up over Attorney General Eric H. Holder Jr.s successor? The question is captivating Washington."
## [77] "Eric H. Holder Jr. has resigned as attorney general, but hes in no hurry to go anywhere."
## [78] "William H. Gross, who built Pimco into one of the largest asset managers in the world, will join Janus Capital after a decision had been made for him to leave Pimco or be forced out, said a person briefed on the matter."
## [79] "William H. Gross, for decades an investment guru, was undone by the increasing complexity of managing his ever-growing fund."
## [80] "Two of Allergans largest shareholders, T. Rowe Price and Pentwater Capital Management, have broken their silence to insist that the company not strike any deals before a scheduled special meeting in December."
## [81] "As Yahoo tries to figure out what to do after raising $6 billion from its stake in the Alibaba Group, a prominent investor, Starboard Value, has emerged to offer some suggestions including buying AOL."
## [82] "Speaker John A. Boehner says Republicans must compete in Northeast"
## [83] "The ruling by Judge Thomas P. Griesa of the Federal District Court in Manhattan allows Citigroup to make a $5 million payment to bondholders."
## [84] "Remembering James A. Trafficant, in videos and soundbites."
## [85] "The low-power computer server that has been seen as a way to revive Hewlett-Packard has never lived up to expectations. Still, HP keeps trying, this time with a chip made by ARM."
## [86] "William H. Gross abruptly leaves Pimco. | AIG case sheds new light on bailout. | DreamWorks Animation in sale talks with SoftBank. | Allergan shareholders speak out."
## [87] "Judge Thomas P. Griesa of Federal District Court in Manhattan stopped short of issuing sanctions, saying he would make a decision about them in the future."
## [88] "A suit filed by Stephen A. Wynn, the casino impresario, against an investor he accuses of slander is the latest salvo in the growing tensions between vocal shareholders and corporations."
## [89] "Kyle T. Dolan spins us a puzzle."
## [90] "The move, which the activist investor Carl C. Icahn had called for, will cleave eBay almost in half and separate it from a company that generates almost half its revenue."
## [91] "The team behind the popular Lower East Side restaurant are publishing a collection of 100 seasonal dishes meant to provide both information and inspiration. Here, they share a recipe with T."
## [92] "Elizabeth C. Gorski gives us the lowdown."
## [93] "The British media and marketing firms acquisition of Advanstar, a private company, would create the largest events organizer in the USA."
## [94] "The initial public offering of Pershing Square Holdings is expected to give the activist investor William A. Ackman a permanent pool of capital to make bigger, bolder bets."
## [95] "James R. Clapper Jr., the director of national intelligence, sent a message to employees on Tuesday defending the nations spy agencies against criticism."
## [96] "David A. ONeil has not announced his next step, though he will probably swing through Washingtons revolving door."
## [97] "A Hungarian musicologist finds Mozarts own score of the Piano Sonata in A, K. 331."
## [98] "After 20 years on the board of the Alvin Ailey Dance Foundation, Joan H. Weill, its chairwoman, is expected to announce Thursday that she is stepping down at the end of the year."
## [99] "Several recent bankruptcy cases and news events have shown that municipal and corporate debt securities have faced issues in pricing and trading, Stephen J. Lubben writes in an In Debt column."
## [100] "The countrys largest banks are rolling out a new service that could help them lessen their reliance on expensive data terminals like those sold by Bloomberg L.P."
## [101] "What White House security breaches happened during the presidencies of Franklin D. Roosevelt, Herbert Hoover and Ronald Reagan?"
## [102] "Politicians and NewYorks elite gathered Wednesday for the Alfred E. Smith charity dinner and roast."
## [103] "George W. Bush talks about his brother Jebs presidential aspirations."
## [104] "Im interested in buying a fairy house, I told the first branch of the phone tree at Bank of America Home Loans. In August, the company agreed to pay a $16.65 billion penalty for its role in selling bonds based on make-believe mortgages. I..."
## [105] "Michael R. Bloomberg and two Disney princesses."
## [106] "Former President George W. Bush said on Thursday that he had been lobbying his younger brother Jeb to throw his hat in the ring in 2016"
## [107] "The government said the criminal charges against Michael J. Coscia, founder of Panther Energy Trading, were the first to be brought under new rules that bar a type of abusive trading called spoofing."
## [108] "The Solomon R. Guggenheim Museum is planning the construction of a new building somewhere in NewYork City that will be used for offices, art storage and some public programming."
## [109] "Speaker John A. Boehner has picked Elise Stefanik, who is running for an open House seat in upstate NewYork, to deliver the partys radio address on Saturday."
## [110] "As the billionaire Kenneth C. Griffin battles his wife in divorce court, he has drawn back the curtain on one of the most prominent marriages in hedge fund world by disclosing terms of their prenuptial agreement."
## [111] "Jesse C. Litvak had been one of the few people convicted of fraud over the bailout of WallStreet, but an appeals court suggested that his conviction was likely to be overturned."
## [112] "Before my son started school, I shared my parents hippie view of the P.T.A. as a perfect-parent filled hassle and just another part of The Establishment trying to squelch creativity out of learning."
## [113] "Vice President Joseph R. Biden Jr. held a question and answer session at Harvard on Thursday."
## [114] "The investor Edward J. Bramson had been pushing to shake up the board of the British private equity firm Electra Private Equity and had sought two seats on its board."
## [115] "Including hard terms for AIG was politically necessary to getting the TARP program going, but Henry M. Paulson Jr. said he did support the bailout package and its terms."
## [116] "These D.I.Y. magazines are on the rise in LosAngeles, where artists are using them as a showcase for their work."
## [117] "A growing body of research indicates that many people who react to gluten may be suffering a condition called non-celiac gluten sensitivity, or NCGS."
## [118] "A big hurdle in the spoofing case against a high-frequency trading firm is that a jury must decide whether one computer fooling another is a crime, Peter J. Henning writes in the White Collar Watch column."
## [119] "Just 24 percent of respondents to a Pew Research Center poll correctly picked Janet L. Yellen as the Feds chairwoman from a list of four names."
## [120] "Maurice R. Greenberg, AIGs former chief and a large shareholder, has spun a ludicrous tale in court that the bailout of the insurer was unfair to its investors."
## [121] "Duncan L. Niederauer, who recently retired as head of the NewYork Stock Exchange, has joined an upstart brokerage firm called Battery East, which aims to help employees of privately held companies sell their shares."
## [122] "Big banks face another round of USA charges. | Henry M. Paulson Jr. testifies that punitive AIG terms were necessary. | Investors cheer the breakup of HP. | Lawyers for Goldman and Libyas sovereign fund clash in court."
## [123] "Timothy F. Geithner, the former Treasury secretary, was a witness in the trial of a lawsuit over the departments role in the bailout of AIG His book was under scrutiny, too."
## [124] "By chronicling the building blocks of technological advances, Walter Isaacsons The Innovators hopes to teach us about the nature of innovation, Jonathan A. Knee writes in a review, the debut Book Entry column."
## [125] "President Obama continues to get low marks on his handling of the threat from the Islamic State, also known as ISIS or ISIL."
## [126] "Valeant and Pershing Square are planning to raise their offer for Allergan. | Profit at Goldman less easy to find. | AIG trial puts Timothy F. Geithner on the hot seat. | Glencores chief pursues Rio Tinto."
## [127] "A derivatives rule is set to change. | JPMorgans data breach causes alarm. | Chinese companies are scooping up real estate across the globe. | Former Treasury Secretary Timothy F. Geithner defends the bailout of AIG"
## [128] "The $2.6 billion deal, which includes debt, is the latest transaction to reshape the drug industry. Auxilium is terminating its deal to acquire QLT."
## [129] "Carl C. Icahn, the billionaire activist investor, sent a letter to Timothy Cook, the chief executive of Apple, saying that the company is hugely undervalued."
## [130] "Critics want a war memorial to remove a sculpture based on Alfred Eisenstaedts famous Life Magazine photo of a sailor kissing a nurse at the end of World War II."
## [131] "One way to level the playing field is to simply repeal the safe harbors in the bankruptcy code, writes Stephen J. Lubben in the In Debt column."
## [132] "Facebook has a new local mobile advertising play. J.C. Penney is in the midst of an e-commerce renaissance."
## [133] "The AM1 Supreme by MARCH LA.B bears the hallmarks of vintage watches, but with a streamlined aesthetic."
## [134] "The government acknowledged there was scant legal precedent for its demand for $1.6 million in restitution from Jesse C. Litvak, who was convicted of securities fraud in March."
## [135] "Ben S. Bernanke, the former Federal Reserve chairman, kept his answers brief on the stand in the lawsuit over the 2008 bailout of American International Group."
## [136] "A sluggish global outlook sends ripples through the markets. | Tuning up UBSs investment bank. | Symantec announces a split. | Ben S. Bernanke defends the AIG bailout."
## [137] "The business will merge with a fledgling firm founded by Paul J. Taubman, the former Morgan Stanley investment banker, who will run the combined firm as chief executive and chairman."
## [138] "If you want to time-travel to the 1990s, youll get a chance at 1 p.m. today, when the William J. Clinton Library releases 10,000 more pages of previously undisclosed documents from the Clinton years."
## [139] "Field + Supply, founded by the interior designer Brad Ford, launches this weekend in a barn in High Falls, N.Y. Here, a look at some of the furniture and objects on offer."
## [140] "Ben Ratliff and Jon Caramanica discuss the singers Tinashe and FKA twigs, and whether they point toward a new conception of R&B."
## [141] "The Army War College rescinded the masters degree of Senator John E. Walsh on Friday, determining that the Montana Democrat plagiarized his final paper there in 2007."
## [142] "A restaurant learns what happens when you tell customers to pay what God wants them to pay, Microsofts chief executive says its O.K. for women to ask for a raise, and new restaurants are making Detroit a culinary oasis."
## [143] "A new USAO. lounge for service members and their families has opened at Terminal 5 at Kennedy Airport in NewYork."
## [144] "Vice President Spiro T. Agnew resigned on this day in 1973."
## [145] "Holly Fallon, whose stage name is Dollicious, and Miriam Hintz, who goes by Haru, posed in the lower courtyard of the Jacob K. Javits Convention Center on Friday at NewYork Comic Con."
## [146] "Two cases raise the question about what role the courts should play in policing negotiations and the limits that can be applied to the tactics one side can use in making a deal, Peter J. Henning writes in the White Collar Watch column."
## [147] "The awards will be announced on Nov. 23 in a show broadcast by ABC."
## [148] "A 28 year old digital communications director is helping to turn Speaker John A. Boehner, no ones idea of a digital-age guru, into a YouTube star."
## [149] "Abigail Johnson, 52, who is president of the parent company of Fidelity Investments, will succeed her father, Edward C. Johnson III, as chief executive."
## [150] "Photos from HongKong, Syria, Yemen and the USA."
## [151] "Adam G. Perl puns his way through our Tuesday puzzle."
## [152] "JPMorgan Chase posts a third quarter profit. | Warren E. Buffett promotes the Berkshire Hathaway brand. | Calculating the cost of Ebola | Derivatives change only goes so far."
## [153] "Denis J. McInerney, a former deputy assistant attorney general in the Justice Department, has returned to the law firm Davis Polk & Wardwell."
## [154] "President Obama has decided to wait until after next months midterm elections to nominate a replacement for Attorney General Eric H. Holder Jr., White House officials said."
## [155] "Some of it is OK."
## [156] "Highlights from the International Herald Tribune archives: Nikita S. Khrushchev was believed to be ousted as the Soviet leader in 1964."
## [157] "Hopper Drawing, by Carter E. Foster, was published by the Whitney Museum of American Art in connection with a Hopper exhibition in 2013."
## [158] "Airbnb, the pioneering home rental service, presents itself as useful and virtuous, but the reality is far less benign, according to Attorney General Eric T. Schneiderman of NewYork"
## [159] "The FBI director, James B. Comey, corrected statements he made on 60 Minutes that the bureau did not do electronic surveillance without a court order."
## [160] "Ben C. Solomon is a Times video journalist reporting on Ebola. His video today, about a team of ambulance drivers in Monrovia, Liberia, shows the dangers they face every day."
## [161] "James L. Amine and Timothy P. OHara have been appointed to the Swiss lenders executive board and will lead the investment banking division with Gal de Boissard, the chief executive for Europe, the Middle East and Africa."
## [162] "Food news from Silicon Valley to the UK."
## [163] "The venture capitalist Marc Andreessen is stepping down from eBays board, months after publicly defending his role as a director against attacks from Carl C. Icahn."
## [164] "Best known for his operas and symphonies, Mr. Glasss Piano tudes are the focus of a new recording and concerts at BAM."
## [165] "An SEC case against a high-frequency trading firm shows how difficult it is to draw the line between acceptable trading strategies and manipulation, Peter J. Henning writes in the White Collar Watch column."
## [166] "Vice President Joseph R. Biden Jr. appeared in a hangar near La Guardia Airport on Monday to endorse Gov. Andrew M. Cuomos plans for commissioning a redesign of four airports in and around NewYork City."
## [167] "Senator Patrick J. Leahy, chairman of the Judiciary Committee, is asking Comcast not to engage in paid prioritization of Internet content after its proposed takeover of Time Warner Cable."
## [168] "Tom Steyer, a billionaire hedge fund founder, has become the largest super PAC donor of all time, passing the casino magnate Sheldon G. Adelson."
## [169] "The piano superstar plays the solo cadenza for the first movement of Mozarts Piano Concerto No. 17 in G."
## [170] "If you are wondering why you have been getting fund-raising pitches from the Democratic Congressional Campaign Committee theres at least one explanation: an organization you did provide your email to has rented its list to the WashingtonDCC.C."
## [171] "Through a clever bit of photo manipulation, a new ad by the NRA puts Iowas Democratic Senate nominee, Bruce Braley, next to the man many gun owners consider their nemesis: Michael R. Bloomberg."
## [172] "Photos from SouthAfrica, HongKong, India and the USA."
## [173] "Vice President Joseph R. Biden Jr. beat the comics to the punch line at a recent Kennedy Center appearance."
## [174] "A federal judge ordered a full mental evaluation in the next 30 days of Omar J. Gonzalez, the man accused of jumping the fence and racing past Secret Service officers into the White House with a knife last month."
## [175] "Highlights from the International Herald Tribune archives: French farmers disagreed with General de Gaulles threat to quit the E.E.C. in 1964."
## [176] "The Feds policies that drove down interest rates to historically low levels have actually exacerbated the inequality problem that Janet L. Yellen said concerned her, writes William D. Cohan in Street Scene."
## [177] "Ben Bradlees funeral will be held Tuesday at the Washington National Cathedral, where Richard M. Nixon was remembered in a memorial service in 1994."
## [178] "NewYorkTimess David E. Sanger remembers a conversation with Ben Bradlee about an article in NewYorkTimes."
## [179] "At an investment conference, Scott A. Mather, who now oversees Pimcos Total Return Fund, defends its outsize holdings in European government bonds and other high-yielding securities."
## [180] "Children who drink rice, almond or soy milk instead of cows milk may have insufficient levels of vitamin D."
## [181] "A man later identified as a J.P. Morgan managing director walked into a live, online broadcast of a group of prominent protesters on Wednesday to vent his frustration with the continuing occupation of key locations in HongKong."
## [182] "Gov. Andrew M. Cuomo of NewYork foresees something really, really, really big for Hillary Rodham Clintons future."
## [183] "Richard Norton Smiths biography of Nelson Rockefeller is a portrait of the world when establishment financiers earned political respect, writes Jonathan A. Knee in a book review."
## [184] "A senior administration official confirms that Kathyrn Ruemmler, the former White House counsel, has taken herself out of the running to succeed Eric H. Holder Jr. as attorney general."
## [185] "The festival of theater, dance and performance art presented by P.S. 122 will take place at spaces around the city, while its home on First Avenue is renovated."
## [186] "Ben C. Solomon has been making videos about the Ebola outbreak, finding scenes of courage even as people struggle to contain the disease."
## [187] "Most European banks pass stress test. | Steven Ballmer could claim huge tax benefits on his LosAngeles Clippers deal. | William A. Ackmans outsize bets. | Court ruling disarms shareholders."
## [188] "Arguably the greatest feat of arms in American military history was performed on the Roanoke River at Plymouth, N.C., in the predawn darkness of Oct. 28, 1864."
## [189] "One side threatens to crack down harder; the other side complains about too much enforcement. The question is whether both sides can be satisfied, Peter J. Henning writes in the White Collar Watch column."
## [190] "Plus, El Anatsuis shimmering curtains, Martin Z. Marguliess photography collection and more art events in the week ahead."
## [191] "Vice President Joseph R. Biden campaigns for Representative Bruce Braley in Iowa on Monday."
## [192] "D.J. Tim Sweeneys cult-favorite program turns 15 next year. To celebrate, he has compiled a new double album, a song from which premieres here."
## [193] "William J. Burns, who just stepped down as deputy secretary of state, is being named president of the Carnegie Endowment for International Peace."
## [194] "Timothy D. Cook, Apples chief executive, said that one million credit cards had been activated on Apple Pay in the first three days that the mobile payment system was live."
## [195] "The information of more than 18.5 million California residents was compromised in 2013, according to Kamala D. Harris, Californias attorney general."
## [196] "An Apple executive recently called the design of Xiaomis phones theft. In return, Mr. Barra pointed out that parts of Apples new phones are a bit like handsets from HTC."
## [197] "Elizabeth C. Gorski cooks up a funny midweek puzzle for us."
## [198] "Michael E. Shapiro, who transformed and enlarged the collection of Atlantas High Museum, will leave next year."
## [199] "Sales of Gov. Andrew M. Cuomos memoir fell by more than 43 percent to 535 copies in its second week on shelves."
## [200] "Kering announced a new sustainable fashion program and the C.F.D.A. and Lexus announced the winners for their Eco-Fashion challenge."
## [201] "Quinn Bradlee, a son of Benjamin C. Bradlee, the former executive editor of The Washington Post, rested his head on the coffin after he spoke at his fathers funeral at Washington National Cathedral on Wednesday."
## [202] "He had claimed that the game publisher Activision Blizzard used his image without his permission in Call of Duty: Black Ops II."
## [203] "Senator Angus King, the Maine independent who had backed another independent, Eliot Cutler, for governor, switched his allegiance to the Democrat, Representative Michael H. Michaud."
## [204] "Photos from Turkey, China, India and the USA."
## [205] "Thomas M. Menino, who served as Bostons mayor for 20 years, has died at the age of 71."
## [206] "Neil L. Rudenstine has presided over the library along with Anthony W. Marx, its president during planning for the institutions controversial renovation."
## [207] "Tim Cooks public announcement that he is gay has been met with support, but some point out that the battle for L.G.B.T. equality at work is far from over."
## [208] "David W. Dunlap has worked at NewYorkTimes for 39 years. He recalls the early news coverage of AIDS and compares it to NewYorkTimess coverage of the arrival of the Ebola virus in the USA."
## [209] "Democratic nostalgia seemed to be the theme of the night as former President Bill Clinton campaigned on Thursday for Gov. Andrew M. Cuomo."
## [210] "The French banks results were a turnaround from the previous quarter, when it took a charge of nearly 6 billion euros for legal penalties in the USA."
## [211] "Albert G. Horvath, the Smithsonian Institutions current senior finance official, will serve as secretary until David J. Skorton can take up his permanent role in July."
## [212] "James B. Stewart provides insight into his column about the Apple chief executives announcement that he is proud to be gay."
## [213] "The return of Janet R. Bender."
## [214] "On Tuesday, Michael S. Dell will try to persuade people that his company is about far more than the personal computers and computer servers it has been known for, with products intended for things as varied as the cloud computing networks of global..."
## [215] "The government has imposed billions of dollars in penalties on the big banks for wrongdoing. But, Peter J. Henning writes in the White Collar Watch column, that may not be enough to change banks behavior."
## [216] "C.H.C.M., the influential Bond Street shop, is debuting an in-house range of refined wardrobe essentials."
## [217] "Photos from Turkey, Syria, India and the USA."
## [218] "Relations between tech companies and the intelligence community has been strained recently, but Adm. Michael S. Rogers played down government concerns."
## [219] "Highlights from the International Herald Tribune archives: Europeans were happy Lyndon B. Johnson was elected in 1964."
## [220] "Millstein & Company, the financial advisory firm founded by James E. Millstein, has hired Mark Walker, a former banker with Rothschild."
## [221] "Despite efforts to improve disclosure under the Freedom of Information Act, the SEC continues to resist sharing data with the public, William D. Cohan writes in the Street Scene column."
## [222] "William A. Ackmans hedge fund could vote its stake in Allergan in support of ousting directors at a meeting scheduled for Dec. 18, bolstering his effort to force the company into a sale."
## [223] "Some post-shellacking advice for the POTUS."
## [224] "Photos from Israel, Algeria, Russia and the USA."
## [225] "John L. Thornton, the chairman of Barrick Gold and a professor at Tsinghua University in Beijing, was designated as a possible successor to Henry M. Paulson Jr. when they were at Goldman Sachs."
## [226] "A. Jerrold Perenchio, the former chairman of Univision, pledged to donate about 50 artworks including notable Impressionist paintings "
## [227] "Rita Doves poem November for Beginners, and the article Heavy Autumn Snowstorm Barrels Across Northeast, by Al Baker, Elizabeth A. Harris and Sarah Maslin Nir, appear in this pairing."
## [228] "An emboldened Speaker John A. Boehner warned President Obama against unilateral action to overhaul the immigration system."
## [229] "The 82 year old has been recording with her daughter, Patsy L. Russell, and John Carter Cash, for a project combining new songs and old folk and gospel tunes."
## [230] "The designation of MetLife as systemically important is a sign that regulators are looking ahead to the next possible crisis, Stephen J. Lubben writes in the In Debt column."
## [231] "The Democratic Senatorial Campaign Committee cancels $2 million worth of ads on behalf of Senator Mary L. Landrieu of Louisiana."
## [232] "Photos from Ukraine, Syria, Scotland and the USA."
## [233] "Barry C. Silk writes puzzles for grown-ups, but once in a while you can find stuff for less mature adults like me."
## [234] "The Sony Pictures Classics co-president will take over from Herbert S. Schlosser, who will be chairman emeritus."
## [235] "Vice President Joseph R. Biden Jr. spoke quickly on his relationship with Prime Minister Benjamin Netanyahu of Israel during remarks to the Jewish Federation."
## [236] "Mr. Morrison will play J.M. Barrie in the show about the creation of Peter Pan; he and Jeremy Jordan have both played the role in earlier productions."
## [237] "The term political correctness is perhaps most often used by its detractors but new research has found that thinking about being P.C. can actually improve creativity."
## [238] "Senator Patrick J. Leahy said he believes Loretta E. Lynch, President Obamas choice for attorney general, should be confirmed easily no matter when the Senate votes."
## [239] "By analyzing samples from each of the five NewYork City boroughs, Jane M. Carlton hopes to detect viruses before an outbreak."
## [240] "Is the food label \"natural\" misleading to consumers? Should the F.D.A. enforce a definition of the term?"
## [241] "Another small sampling of sparkling prose in recent editions, from serious to silly to Kenny G."
## [242] "President Obama and President Vladimir V. Putin of Russia discussed Iran, Syria and Ukraine during three brief conversations at the Asia-Pacific Economic Cooperation summit meeting."
## [243] "Andrea C. Bonomi, the Italian businessman battling a Chinese French bid for Club Mditerrane, announces a sweetened offer for the resort company."
## [244] "Reporters were surprised on Tuesday morning when the Federalist Society, a conservative legal group, announced that the news media would be barred from an appearance by Justice Samuel A. Alito Jr. at its annual gala black-tie dinner."
## [245] "Heaven, the first offering from the Birmingham raised, LosAngeles-based 24 year old T.O.L.D., premieres here."
## [246] "The activist investor William A. Ackman has acquired an 8.5 percent economic interest in the former animal health arm of Pfizer."
## [247] "Photos from France, China, Poland and the USA."
## [248] "Both steps fall short of the big breakup of Dow Chemical being advocated by the activist hedge fund manager Daniel S. Loeb."
## [249] "Mr. Grammer is to play the largely comic role of Charles Frohman, the theater producer of Peter Pan playwright J.M. Barrie."
## [250] "Photos from Turkey, HongKong, Afghanistan and the USA."
## [251] "Steven A. Ballmer, Microsofts former chief executive, is making a financial contribution that will enable the school to expand its computer science faculty by about 50 percent."
## [252] "Dassault Systmes, a French company, has a five year research agreement with the F.D.A. focused on using 3 D technology for simulating the reliability of pacemaker wires."
## [253] "Its in the bullet points: Shoes, bags, more mobile and social media, and a focus on the USA."
## [254] "Berkshire Hathaway, the conglomerate run by Warren E. Buffett, will acquire Duracell using a transaction aimed at lowering the tax bill."
## [255] "President Obama and Prime Minister Dmitri A. Medvedev of Russia at the East Asia summit plenary session in Naypyidaw, Myanmar, on Thursday."
## [256] "Readers wrote about a public reckoning for our wars in Iraq and Afghanistan and the sacrifices of our troops in comments on Daniel P. Bolgers Op-Ed, The Truth About the Wars."
## [257] "Warren E. Buffett has an appetite for large and creatively structured acquisitions. A look back at some of his recent deals."
## [258] "Advanced Placement courses can be a real boon to students motivated by intellectual curiosity and a love of learning. But for students looking to please their parents or for those in pursuit of transcript padding and other false academic idols, AP..."
## [259] "Thomas O. Kriegsmann is to be its director of programs, a new position that will include artist development projects."
## [260] "David W. Dunlap, a Metro reporter for NewYorkTimes, took us back to the moment when the paper stopped being produced with Linotype machines."
## [261] "Daniel S. Loeb, the activist hedge fund manager, has been pressing for a breakup of Dow Chemical and has been rebuffed in his attempt to name two directors."
## [262] "More than $40 million has been raised in honor of Joan H. Weill, who is stepping down as chairwoman of the Alvin Ailey Dance Foundation."
## [263] "Net neutrality could give Comcast a way out of its Time Warner Cable acquisition. | Warren E. Buffetts deal for Duracell satisfies a number of his deal-making preferences. | Virgin America to begin trading. | Goldman seeks to lure tech talent."
## [264] "Evan R. Chesler, the chairman of the law firm Cravath, Swaine & Moore, will take over as the library chairman from Neil L. Rudenstine."
## [265] "Finish Big and Succession, examine how entrepreneurs and corporate chieftains alike can be assessed on their departures, Jonathan A. Knee writes in a joint book review."
## [266] "Senator Mary L. Landrieus campaign has not hesitated to embrace her stance on the XL Pipeline ahead of her runoff election against Representative Bill Cassidy."
## [267] "Celebrities should not quit their day jobs, according to Andrew J. Ries."
## [268] "And what about that I.S.I. dreamboat Aasar Khan?"
## [269] "The alliance between the American and German companies is expected to strengthen Mercks oncology business in several important markets, including the USA."
## [270] "The animal health care company Zoetis, which has attracted the interest of the deal maker William A. Ackman, has made a deal of its own."
## [271] "The $66 billion acquisition by Actavis would be the largest deal this year and would thwart the takeover effort of Valeant and William A. Ackman."
## [272] "Representative Nancy Pelosi has a birthday wish for Speaker John A. Boehner."
## [273] "Insider trading cases are casting a spotlight on whether courts should defer to administrative agencies in determining criminal violations, Peter J. Henning writes in the White Collar Watch column."
## [274] "Representative Steve Israel joked about handing off the WashingtonDCC.C. chairmanship."
## [275] "Mayor Francis G. Slay said troops operating within the city of SaintLouis would have a secondary role in responding to any protests, and that police officers would be the ones dealing directly with demonstrators."
## [276] "What do you do when you are a billionaire and a movie buff? If you are Paul G. Allen, you turn the iconic theater you already own into a state-of-the-art venue."
## [277] "William A. Ackman and Valeant Pharmaceuticals lost their hostile battle to buy Allergan and still came away with a total of $2.6 billion."
## [278] "WallStreet is on pace for a big year of deal-making. | William A. Ackman may have lost Allergan, but he still came away a winner. | The Justice Department is weighing a civil suit against Angelo R. Mozilo, the former chief executive of Countrywide..."
## [279] "A briefing book for the White House chief of staff, Denis R. McDonough, had a note from one of his children attached to it as he sat in a meeting on Ebola in the White House on Tuesday."
## [280] "Much like their leader, Warren Buffett, the chief executives of Berkshire Hathaways subsidiaries are a savvy group of acquirers. Lawrence A. Cunningham, in the Another View column, explains why."
## [281] "American Ballet Theater and the Segerstrom Center for the Arts in Costa Mesa, Calif., will open the American Ballet Theater William J. Gillespie School on the Segerstrom campus in September, both organizations announced on Tuesday."
## [282] "Assembling the tale of G.M.s faulty ignition switch, which led to the deaths of at least 13 people, took months of digging by a team of reporters and editors."
## [283] "Joseph P. Clancy, acting director of the Secret Service, is to testify before Congress."
## [284] "Thomas B. Leonardi will be joining the investment bank as a senior adviser, focusing on the insurance industry."
## [285] "The release of an interview Robert Rubin gave nine years ago offers a window into the thoughts of the rich and powerful, William D. Cohan writes in the Street Scene column."
## [286] "On Saturday, the Arcangel-composed series Dances for the Electric Piano will be staged for the first time in the USA. Listen to it here."
## [287] "Representative Paul D. Ryan, the new chairman of the Ways and Means Committee, said that a broad overhaul of the tax code depended on the administrations engagement."
## [288] "Gov. Bobby Jindal of Louisiana tried to cut off an immigration discussion with Chuck Todd of NBC."
## [289] "The country plans to scrap its state monopoly on the sale of salt, a move that would mark the end of a system that can be traced back to 685 B.C."
## [290] "Despite being about to receive his Ph.D. in physics from Harvard, Kevin Niu plans to eschew the academic life to follow his calling in the entertainment industry."
## [291] "Daniel K. Tarullo, the Fed governor, described to a Senate subcommittee potential rules that could restrict banks from some types of commodities operations."
## [292] "Speaker John A. Boehner addressed President Obamas immigration actions."
## [293] "Is the Federal Reserve up to the task of regulating financial institutions that are so large and complex? the chairman of a Senate panel asked William C. Dudley."
## [294] "Attorney General Eric H. Holder Jr. offered guidance to law enforcement agencies and urged protesters to be restrained."
## [295] "Vice President Joseph R. Biden Jr. had to beat a hasty retreat on Friday at the sight of an unruly crowd of Ukrainians in Kiev protesting the new government amid continuing unrest in that former Soviet republic."
## [296] "Paul J. Taubman is bringing Don Cornwell, a managing director who specializes in sports team deals, to his boutique investment bank."
## [297] "At noon on Friday a handful of people performed a mock lynching across the street from the Old Courthouse in SaintLouis as a silent protest of the police killings of Michael Brown and Vonderrit D. Myers Jr."
## [298] "Last week was a rough one for the Federal Reserve. | Paul J. Taubman continues to poach talent from Morgan Stanley, his former employer. | Mathew Martoma has started his nine year prison term for insider trading."
## [299] "The Chapter 11 filing by Aereo prompts a larger question of whether bankruptcy courts should actively police debtors motives for filing cases, writes Stephen J. Lubben in the In Debt column."
## [300] "The Fed has to balance the conflicting roles it plays as an overseer of the banks and as a law enforcement agency pursuing misconduct, Peter J. Henning writes in the White Collar Watch column."
## [301] "Speaker John A. Boehner announced that he was reappointing Representative Trey Gowdy, Republican of SouthCarolina, as chairman of the special committee on Benghazi for the 114th Congress that begins in January."
## [302] "Senator Elizabeth Warrens wrath toward Antonio F. Weiss is misdirected, and her understanding of the inversion deal on which she bases her opposition appears misinformed."
## [303] "Senator Charles E. Schumer said Tuesday that it was a political mistake to pass the Affordable Care Act in 2010 because voters at the time were looking for relief from the recession not universal health care."
## [304] "Photos from Egypt, HongKong, Kenya and the USA."
## [305] "The return of Michael S. Maurer."
## [306] "Thanksgiving was my Korean familys annual recommitment ceremony to the USA."
## [307] "The potential political costs of the civil unrest in Ferguson, Mo.; speculation about Chuck Hagels replacement; and a look at Speaker John A. Boehners turkey brine recipe."
## [308] "StevenACohens ex-wife gets outside financing for her lawsuit. | Poison puts develop a downside. | Trustee moves to repay Mt. Gox creditors. | Philip A. Falcone to Depart the Harbinger Group."
## [309] "Photos from HongKong, China, Syria and the USA."
## [310] "In Newport, R.I., visitors can tour storied mansions decorated for Christmas."
## [311] "Michael B. Jordan and John Turturro will take part in a reading of the script for Spike Lees Do the Right Thing"
## [312] "David W. Dunlap, a Metro reporter and the writer of the Building Blocks column, explored the sometimes close connection between Mr. Castro and NewYorkTimes."
## [313] "A wide open grid from Elizabeth C. Gorski."
## [314] "Celebrate the holidays with Elizabeth C. Gorski."
## [315] "Programs, and payouts, to encourage whistle-blowers to come forward with wrongdoing are likely to increase with the new Congress, Peter J. Henning writes."
## [316] "Jeh C. Johnson, the Homeland Security secretary, is prepared to defend the presidents decision to take executive action to overhaul the immigration system."
## [317] "Photos from HongKong, Paris, SouthKorea and the USA."
## [318] "The reporters J. David Goodman and Michael Wilson described a suburban mothers descent to a heroin addict whose home on Staten Island served as a drug-dealing center."
## [319] "Chief Justice John G. Roberts Jr. is well known for his ability to quote obscure case law, but on Monday he invoked the rapper Eminem at the Supreme Court."
## [320] "Mary L. Landrieu, a three term senator, is in the race of her career against the Republican Bill Cassidy as Louisiana voters head to the polls for a runoff vote on Dec. 6."
## [321] "Bank of NewYork Mellon said on Tuesday that it would give a board seat to Edward P. Garden of the hedge fund Trian Fund Management, forestalling a potentially costly proxy fight."
## [322] "The deal for IndCor Properties will make Singapores sovereign wealth fund a major owner of warehouses and distribution centers in the USA."
## [323] "Speaker John A. Boehner on options that Republicans are considering to block President Obamas immigration action."
## [324] "Senator Joseph R. McCarthy was censured 60 years ago today."
## [325] "Jonah M. Kessel filmed a documentary video in China and Myanmar, where jade mines have become a free market for heroin, and tells of the story of an addict he encountered."
## [326] "In a spellbinding letter to Jim Cramer, the activist investor J. Carlo Cannell urged him to cut his pay 70 percent, resign from CNBC and direct his energy to helping your fellow shareholders crawl back from Hades."
## [327] "The Fox affiliate in NewOrleans is taking issue with the use of its news anchors words in a campaign ad for Senator Mary L. Landrieu."
## [328] "NewYorks chokehold case has thrown a volatile element into Loretta E. Lynchs confirmation hearings as USA attorney general."
## [329] "Senator Elizabeth Warrens opposition to a nominee for a role at the Treasury Department demonstrates her blatant political motivation, William D. Cohan writes in the Street Scene column."
## [330] "J. Carlo Cannell wants Jim Cramer to sell TheStreet or take a pay cut and quit CNBC. A sale may be wishful thinking, says Quentin Webb of Reuters Breakingviews. But Mr. Cramers payout is hard to justify."
## [331] "The appellate division of the NewYork State Supreme Court dismissed a lawsuit at the heart of a bitter dispute between the billionaire collector Ronald O. Perelman and the art dealer Larry Gagosian."
## [332] "Photos from Russia, SouthAfrica, India and the USA."
## [333] "Andrea C. Bonomi, the Italian businessman fighting a Chinese investor for Club Mditerrane, raised his bid to $29.65 a share for the French resort operator."
## [334] "President Obama on Friday announced his selection of Ashton B. Carter to lead the Pentagon, White House officials said, embracing a physicist and national security centrist who may advocate a stronger use of American power."
## [335] "Jon Caramanica and A.O. Scott discuss what this update of The Bodyguard gets right about the music business."
## [336] "The famously outspoken, leather-clad architect is being honored this week with Design Miamis inaugural Design Visionary Award and an exhibition at the Bass Museum of Art. In this video, he discusses his career with T."
## [337] "An architect who hopes to save a Paul Rudolph-designed building in Orange County, N.Y., presented his detailed proposal to county leaders there on Friday."
## [338] "Demonstrations focused on fatal police encounters continued on Saturday, with a rally at the Louis H. Pink Houses in Brooklyn to protest the fatal shooting of Akai Gurley there and a grand jurys decision not to indict an officer in the death of..."
## [339] "Will the SECs treatment of Bank of America, Peter J. Henning asks in the White Collar Watch column, become the norm for other banks seeking waivers from rules on bad actors?"
## [340] "After years of research, there is genuine hope for reducing the toll exacted by lung cancer, which is the leading cause of cancer deaths in the USA."
## [341] "Senator Charles E. Schumer of NewYork may be Republicans favorite Democrat these days."
## [342] "Recommendations from the American Bankruptcy Institute failed to address derivatives held by nonfinancial institutions that will keep operating, writes Stephen J. Lubben in the In Debt column."
## [343] "Readers expressed outrage and despair in comments on a Friday editorial, It Wasnt Just the Chokehold: Eric Garner, Daniel Pantaleo and Lethal Police Tactics, and on an Op-Ed from Eric L. Adams, We Must Stop Abuse of Black Men."
## [344] "Jessica Bibliowicz will become chairwoman of the Cornell Weill Medical College, effective Jan. 1. Her father, Sanford I. Weill, has been chairman for two decades."
## [345] "The SecondMarket exchange won all but one of the 20 Bitcoin blocks in the governments auction. The venture capitalist Timothy C. Draper, who swept the first auction, won only 2,000 Bitcoins this time."
## [346] "Brian T. Moynihan tells a conference that his bank is expecting trading revenue in the fourth quarter to be down from last quarter and from a year earlier."
## [347] "The revival of A.R. Gurneys play will close this Sunday, before some of its scheduled stars ever get to perform."
## [348] "Jonathan Gruber, the M.IT economist who advised the Obama administration on the Affordable Care Act, apologized on Tuesday for inflammatory comments that have brought negative attention to the law in recent months."
## [349] "The Senate Intelligence Committee has released its long-awaited review on the torture of prisoners held by the Central Intelligence Agency during the George W. Bush administration."
## [350] "Just hours after announcing its $8.4 billion acquisition of Cubist Pharmaceuticals, a Delaware judge invalidated patents owned by the antibiotics maker. Merck appears to have been caught up in the M.&.A. exuberance, says Robert Cyran of Reuters..."
## [351] "Recent criticism of James J. Cramers compensation from TheStreet.com has painted him as another greedy WallStreet executive looking out for himself but not shareholders."
## [352] "Gov. Chris Christie of NewJersey, who was appointed USA attorney for the state by President George W. Bush in the weeks after the Sept. 11 attacks, seemed not eager on Wednesday to discuss the Senate report on the brutal American..."
## [353] "In this lesson we offer three different teaching ideas to engage students in the debate over immigration policy in the USA."
## [354] "The latest appeals court decision will make some insider-trading cases harder, but it does not give a free pass to hedge funds to trade on any confidential information, Peter J. Henning writes in White Collar Watch."
## [355] "The latest NewYorkTimes poll is based on telephone interviews conducted Dec. 4 through 7 with 1,006 adults throughout the USA."
## [356] "The Confucius Peace Prize was first given out in late 2010 as a rejoinder to the Nobel Peace Prize. President Vladimir V. Putin of Russia is a past recipient."
## [357] "Were going to continue to press his nomination forward, Treasury Secretary Jacob J. Lew said. He also spoke about corporate taxes and sanctions on Russia."
## [358] "George R. Goldner will become a private art adviser after 21 years heading the department. Nadine M. Orensteinhas been named to succeed him."
## [359] "Lloyd C. Blankfein, Goldman Sachss chief executive, addressed the antagonism that has been coming out of Washington."
## [360] "Francis J. Shammo, Verizons chief financial officer, said the company planned to continue to invest in its FiOS fiber-optic network and its wireless systems regardless of the outcome of the broadband debate."
## [361] "Senator Dianne Feinstein, chairwoman of the committee that investigated the CIAs interrogation program, offered a live rebuttal as John O. Brennan, the agencys director, responded to the committees findings."
## [362] "President Obama and Vice President Joseph R. Biden Jr. both made calls to Democratic lawmakers on Thursday in an urgent effort to save a $1.1 trillion spending bill that the White House and Republicans say is a worthy compromise."
## [363] "The Blackstone Group, the private equity giant run by Stephen A. Schwarzman, is almost finished raising a new fund for energy investments that is expected to exceed $4 billion in assets."
## [364] "OscarSSchafer will succeed the current NewYork Philharmonic chairman, Gary W. Parr, early next year."
## [365] "Friday and the weekend: a struggle for L.G.B.T. protections, more chilly weather, protests on Saturday, and the week in pictures."
## [366] "An A.C.L.U. video features military personnel who objected to the Bush-era torture program."
## [367] "Two new books provide insight into lesser-known stages of Charles M. Schulz and Jack Kirbys careers."
## [368] "On this day in 2000, the Supreme Court ruled 5 to 4 that there would be no additional counting of presidential votes in Florida. 35 days after Election Day, the decision effectively handed the presidency to George W. Bush."
## [369] "Hours after casting his final vote as a member of Congress, Representative John D. Dingell was hospitalized after taking a spill earlier in the week."
## [370] "Jungle Lady is the first single off the NewYork-based act Lion Babes self-titled debut EP."
## [371] "The Securities and Exchange Commission said it would not be able to proceed with the lawsuit because the two main witnesses it intended to call to testify are in Poland and had no plans to return to the USA."
## [372] "The return of Joel D. Lafargue, after a long break."
## [373] "Nominees that didnt make the final cut this year include Nine Inch Nails, Sting, N.W.A, Chic and Kraftwerk."
## [374] "The disturbing statistics on military rape and the fleeting attention of the public led Mary F. Calvert to document the stories of those who survived assaults."
## [375] "James E. Staley, a managing partner at the NewYork hedge fund BlueMountain Capital Management, is expected to be considered for the board during UBSs annual meeting in May, and to join the banks risk committee."
## [376] "Wednesday: The M.T.A. honors its workers, oddly warm weather, and a vigil for Pakistan."
## [377] "President Obama is scheduled to speak about relations with Cuba at 12 p.m. ET."
## [378] "Highlights and video of the remarks by President Raul Castro of Cuba on a diplomatic breakthrough with the USA."
## [379] "Speaker John A. Boehner on Wednesday called President Obamas new policy on Cuba the latest in a long line of mindless concessions to a dictatorship that brutalizes its people and schemed with our enemies."
## [380] "Alan P. Gross prepared his remarks at his attorneys office in Washington on Wednesday."
## [381] "Most Americans support normalizing trade and diplomatic relations with Cuba and consider what happens in the communist nation to be important to the interests of the USA."
## [382] "Photos from Pakistan, Australia, France and the USA."
## [383] "President Obama took the occasion of Alan P. Grosss release not only to acknowledge the interest the Jewish community had taken in Mr. Grosss case, but also to connect his release to the holiday and to Jewish teaching."
## [384] "Curtis L. Buser, who has been interim finance chief for the last seven months, is taking over the job permanently."
## [385] "Speaker John A. Boehner posted a holiday greeting on Thursday, a video poem title Happy Christmas to All, although the Speakers wishes may not extend all the way to the White House."
## [386] "Recommended books for learning more about Cubas history and its relationship to the USA."
## [387] "In a speech on Thursday, Benjamin M. Lawsky, NewYork States top financial regulator, provided the most detail yet on revisions to his proposed Bitcoin rules."
## [388] "The firm promoted John E. Waldron, one of its star deal makers, to become a co-head of its investment bank, succeeding a scion of one of Goldmans most famous families."
## [389] "The disposal is the latest move for Xerox, which has been transformed under the leadership of Ursula M. Burns into a provider of various business services."
## [390] "Speaker John A. Boehner has invited President Obama to address the country on Jan. 20 and give a State of the Union speech before a joint session of the new Republican-controlled Congress."
## [391] "Mark A. Flaherty and Mark O. Winkelman will serve on a board that now has 14 members."
## [392] "Goldman names John E. Waldron co-head of its investment bank. | Regulators deem MetLife too big to fail. | Another big whistle-blower reward in Bank of America case. | London tenants win battle over USA equity firm."
## [393] "After an investigation into conflicts of interest, William C. Erbey has agreed to step down from his position as chairman of Ocwen and four other related companies."
## [394] "Its about non-digital IT."
## [395] "Fred R. Conrad may be best known for his exquisite portraits, but an assignment in Kosovo taught him the value of watching and waiting for the story to come to him."
## [396] "An overhaul of the Freedom of Information Act did not survive the legislative deal-making that produced the spending plan. William D. Cohan laments that loss in the Street Scene column."
## [397] "Its hard to be FDR."
## [398] "Ocwen should look at Clayton Homes, whose credo puts customers first, writes Lawrence A. Cunningham in Another View."
## [399] "Photos from Ukraine, India, Nepal and the USA."
## [400] "NewYorkTimes reporter David W. Dunlap explains how coverage of the Bay of Pigs disaster unfolded."
## [401] "After decades of oppression of people who identify as L.G.B.T., the Cuban government now pays for gender reassignment surgery."
## [402] "Look for the hidden message from Peter A. Collins."
## [403] "The new year will bring cases from the subprime mortgage crisis, a Justice Department decision on an appeal of an insider trading ruling and, perhaps, new accounting frauds, Peter J. Henning writes in the White Collar Watch column."
## [404] "Under the F.D.A.s new policy, any man who has had sex with another man in the last year may not donate."
## [405] "Representative Michael G. Grimm of NewYork insisted last week that he would not resign from his seat. On Monday night, he changed his mind."
## [406] "Speaker John A. Boehner gave his seal of approval to Representative Michael Grimms resignation."
## [407] "David E. Sanger, chief Washington correspondent for NewYorkTimes, recalled a visit to the country 25 years ago."
## [408] "Former President George H. W. Bush was released from a Houston hospital where he has been staying since last week after experiencing shortness of breath."
## [409] "Speaker John A. Boehner expressed his support for Representative Steve Scalise, who spoke at a conference for white supremacists in 2002."
## [1] "Remaining Acronyms in Abstract:"
## [1] "In the 1864 election, Gotham delivered nearly twice as many votes to the presidents opponent, George B. McClellan."
## [2] "Researchers are finding more evidence that women who take S.S.R.I. depressants like Prozac and Zoloft increase the likelihood of a variety of health problems in their newborns."
## [3] "Eric T. Schneiderman, NewYorks attorney general, filed a suit on Tuesday that accuses Evans Bank of denying mortgages to African-Americans in Buffalo regardless of their credit."
## [4] "Jeffrey H. Knox, a senior federal prosecutor who butted heads with a number of WallStreet banks, is switching sides."
## [5] "In which Peter A. Collins disproves Aesop."
## [6] "Antonia M. Apps, the lead federal prosecutor in the criminal trial last year of Michael Steinberg, is taking a job with the law firm Milbank, Tweed, Hadley & McCloy."
## [7] "Scholarly musicians prove that Ph.D.s can play."
## [8] "In this new series, Chris Labzda and Bon Duke, the co-founders of the NewYork Fashion Film Festival, curate a short film each week for T. This weeks installment: A collage of classics overlaid with sound from Lauren Wolksteins Social Butterfly."
## [9] "I moved to Hudson, N.Y., to slow down and save money. I didnt know Id feel so lonely."
## [10] "A study suggests that exercise can help kids, especially those with A.D.H.D., focus in class."
## [11] "How should the USA respond to the killings of Steven J. Sotloff and James Foley?"
## [12] "Although banks are hiring armies of legal and compliance professionals in response to settlements and fines, they would do better to find ways to streamline their existing legal work, Geoffrey A. Moore and Mark Harris write in an Another View column."
## [13] "William T. Shermans 1864 campaign to take the Georgia city was one of the bloodiest of the Civil War."
## [14] "George Martin, author of the Song of Ice and Fire series, will be promoting his new book at the 92nd St. Y."
## [15] "The workers are OK."
## [16] "The Securities and Exchange Commission has chosen Tracey L. McNeil, an agency veteran and former corporate lawyer, to act as a liaison in resolving problems that retail investors may have with the agency."
## [17] "John A. Paulsons hedge fund suffered across-the-board losses in July and did only marginally better in August, according to an update to investors."
## [18] "Mr. Sheen will direct and perform in Dylan Thomass work he called it a play for voices on Oct. 26 at the 92nd Street Y."
## [19] "The acquisition is the latest move by Jeffrey R. Immelt, the chief of GE, to refocus the conglomerate on its core industrial businesses."
## [20] "The Chinese state news media has cited concerns that the USA is indoctrinating Chinese students by including its founding documents upholding freedom and human rights in the SAT."
## [21] "At the midpoint of the range, the Citizens Financial Group, based in Providence, R.I., would be valued at $13.4 billion."
## [22] "Mark P. Frissoras decision comes a few weeks after Carl C. Icahn disclosed an 8.48 percent stake in the rental car provider."
## [23] "Martin Lipton is virtually guaranteeing the continuation of policies opposed by many NYU faculty members, William D. Cohan writes in the Street Scene column."
## [24] "China Central Television used images of Condoleezza Rice in a report about a Beijing visit by Susan E. Rice, President Obamas national security adviser."
## [25] "If youve never heard the name Lloyd J. Austin, theres a good reason"
## [26] "This weeks video features Ismenia Mendes and David McElwee in a scene from A. R. Gurneys 1977 drama The Wayside Motor Inn."
## [27] "The media and financial data company Thomson Reuters is looking to sell peHUB, Buyouts and Venture Capital Journal, according to a report by peHUB."
## [28] "News headlines briefly flashed that Carl C. Icahn had raised his stake in Gannett, the media company, to nearly 9 percent. But as it turns out, his firm simply made a typo in a regulatory filing."
## [29] "Just two days after her USA Open victory, Serena Williams hosted her first ever fashion show for HSN."
## [30] "Nearly 70 years after they parted, a Chinese veteran is on a quest to find a Japanese woman he met shortly after World War II. His tale offers a warmer recollection of that era."
## [31] "In this lesson we offer a series of topics and questions paired with Times essays, articles, slide shows and videos to help students dig deeper into the causes, effects and overall legacy of World War I."
## [32] "A collection of material related to Rosa Parks, bought last month by Howard G. Buffett, is being lent to the Library of Congress for 10 years."
## [33] "A soldier takes time out of uniform to pursue an M.F.A. in writing, and discovers that what sets him apart, as a veteran at the keyboard, is not as important as the common ground he shares with other writers."
## [34] "Worn by the chef from July 29 to Aug. 4 while cooking at his two restaurants, Blue Hill in Manhattan and Blue Hill at Stone Barns in Pocantico Hills, N.Y."
## [35] "Gov. Andrew M. Cuomo and Mayor Bill de Blasio both enjoyed an Italian sausage at the Feast of SanGennaro in Little Italy on Saturday."
## [36] "Eike Batista was accused of manipulating the share price of his now-bankrupt petroleum company OGX."
## [37] "The activist investor William A. Ackmans new fund, Pershing Square Holdings, aims to list on the the Euronext Amsterdam exchange on Oct. 13."
## [38] "Fraud in the market for penny stocks continues unabated, two criminal cases filed last week show, Peter J. Henning writes in the White Collar Watch column."
## [39] "Quarries and connecting trenches were nothing less than small cities, where the walls bear witness to the people who fought and died in World War I."
## [40] "The deal for Vkontakte ends tensions between the social networks minority shareholders and Alisher B. Usmanov, the Russian billionaire who controls Mail.ru."
## [41] "Sears is borrowing $400 million from its chief, the billionaire Edward S. Lampert, through his hedge fund."
## [42] "KKRs purchase of Pioneers D.J. audio equipment business is the modern-day equivalent of providing gold-rush pickaxes to participants in the rave rush, Jeffrey Goldfarb writes in Reuters Breakingviews."
## [43] "Two grandchildren of Dwight D. Eisenhower say proposed changes by Frank Gehry to his memorial to the former president do not satisfy their concerns."
## [44] "In a letter to Electra Private Equity shareholders, the activist investor Edward J. Bramson said a change in approach in Electras strategy could increase the value of its shares by about $1.6 billion."
## [45] "Prosecutors in Brazil have argued that Mr. Batista profited from insider information when he sold shares in his now bankrupt petroleum company, OGX."
## [46] "Comments by Marshall L. Miller, the No. 2 official in the Justice Departments criminal division, reflect the agencys renewed interest in charging individual bank employees rather than just the banks."
## [47] "The top job at the database giant will be shared by Mark V. Hurd, now co-president, and Safra Catz, who is co-president and chief financial officer."
## [48] "Alibaba is set to start trading on the NewYork Stock Exchange under the ticker symbol BABA. | Markets showed relief that the United Kingdom would not face a tumultuous breakup. | Lawrence J. Ellison announced his retirement as chief executive of Oracle."
## [49] "Average waist circumference but not B.M.I. has increased significantly in the USA, a new study reports."
## [50] "The British private equity firm accused Edward J. Bramsons Sherborne Investors Management of making unverifiable statements and unsubstantiated claims in its effort to reshape Electra."
## [51] "Alibaba soared in its public market debut. | Siemens agreed to buy the Dresser-Rand Group. | EMC weighed a deal with H.P. | Public pension funds may be souring on hedge funds."
## [52] "Venezuela may represent the Latin American and Caribbean nations and sit right next to the USA."
## [53] "Speaker John A. Boehner seemed to blame lazy Americans for the stalled economy."
## [54] "It was unclear whether President Petro O. Poroshenko would travel to NewYork to deliver his address as scheduled on Thursday."
## [55] "As the reclusive genius Richard D. James releases his first album in 13 years, the graphic designer with whom he has frequently collaborated discusses their work together."
## [56] "Vice President Joseph R. Biden Jr. met Carina Castro, the daughter of Julin Castro, secretary of Housing and Urban Development, at a reception honoring Hispanic Heritage Month."
## [57] "Attorney General Eric H. Holder Jr. will announce Tuesday that the federal government will end its fiscal year next week with 4,800 fewer prisoners, the first time since 1980 that the inmate population has declined from year to year."
## [58] "President Obama could end up leaving a smaller imprint on the judiciary than either President Bill Clinton or President George W. Bush."
## [59] "Ever wonder why jewelry stores hide the price tags? How invoice factoring works for small businesses. And why German companies are on a buying spree in the USA."
## [60] "Speaker John A. Boehner urges the new Veterans Affairs secretary to look to the private sector in reviewing how the department delivers health care to veterans."
## [61] "On this day in 1952, Richard M. Nixon gave his famous Checkers speech."
## [62] "To kick off Paris Fashion Week, the chic native whose shirting-based collection is stocked at Barneys NewYork and Dover Street Market shares a list of her favorite places around town with T."
## [63] "Gov. Paul R. LePage of Maine wants to help the NFL address its domestic violence problem."
## [64] "Maurice R. Greenberg, the former chief of AIG, has now raised several million dollars from three WallStreet investors to help cover the cost of the case."
## [65] "Electra said that two influential shareholder advisory services opposed efforts of the activist investor Edward J. Bramson, through his Sherborne Investors Management, to shake up the board of the British private equity firm."
## [66] "Representative James A. Traficant was seriously injured after a tractor accident on his family farm Tuesday night, according to local media reports in Youngstown, Ohio."
## [67] "The role of former President Gerald R. Ford in the filing of a friend-of-the-court brief in a Michigan affirmative action case."
## [68] "The insider trading case against Michael A. Lucarelli, a former executive at Lippert/Heilshorn and Associates, was relatively minor. But he got attention for running barefoot from the court in August."
## [69] "Judge Richard M. Berman of Federal District Court in Manhattan, before sentencing conservative author Dinesh DSouza to probation for violating federal campaign finance laws."
## [70] "Speaker John A. Boehner, an avid golfer, is gearing up for the Ryder Cup, the intense competition between the USA and Europe that begins Friday in Scotland."
## [71] "Speaker John A. Boehner tells First Draft that the new Congress should debate military action in Syria."
## [72] "A new poll has Senator Mark R. Warner, the Democratic incumbent in Virginia, up by 9 points over his Republican opponent."
## [73] "Eric H. Holder Jr.s resignation as attorney general means that President Obama is losing one the longest-serving members of his cabinet and one of his closest confidants."
## [74] "The Department of Labor is putting money behind its push to expand paid family leave in the USA. The issue could be a factor in coming elections if enough voters ask about it."
## [75] "Members of the NewYork Police Department turned out in force on Thursday for the funeral of Officer Michael Williams in LaGrangeville, N.Y."
## [76] "How big a fight will President Obama put up over Attorney General Eric H. Holder Jr.s successor? The question is captivating Washington."
## [77] "Eric H. Holder Jr. has resigned as attorney general, but hes in no hurry to go anywhere."
## [78] "William H. Gross, who built Pimco into one of the largest asset managers in the world, will join Janus Capital after a decision had been made for him to leave Pimco or be forced out, said a person briefed on the matter."
## [79] "William H. Gross, for decades an investment guru, was undone by the increasing complexity of managing his ever-growing fund."
## [80] "Two of Allergans largest shareholders, T. Rowe Price and Pentwater Capital Management, have broken their silence to insist that the company not strike any deals before a scheduled special meeting in December."
## [81] "As Yahoo tries to figure out what to do after raising $6 billion from its stake in the Alibaba Group, a prominent investor, Starboard Value, has emerged to offer some suggestions including buying AOL."
## [82] "Speaker John A. Boehner says Republicans must compete in Northeast"
## [83] "The ruling by Judge Thomas P. Griesa of the Federal District Court in Manhattan allows Citigroup to make a $5 million payment to bondholders."
## [84] "Remembering James A. Trafficant, in videos and soundbites."
## [85] "The low-power computer server that has been seen as a way to revive Hewlett-Packard has never lived up to expectations. Still, HP keeps trying, this time with a chip made by ARM."
## [86] "William H. Gross abruptly leaves Pimco. | AIG case sheds new light on bailout. | DreamWorks Animation in sale talks with SoftBank. | Allergan shareholders speak out."
## [87] "Judge Thomas P. Griesa of Federal District Court in Manhattan stopped short of issuing sanctions, saying he would make a decision about them in the future."
## [88] "A suit filed by Stephen A. Wynn, the casino impresario, against an investor he accuses of slander is the latest salvo in the growing tensions between vocal shareholders and corporations."
## [89] "Kyle T. Dolan spins us a puzzle."
## [90] "The move, which the activist investor Carl C. Icahn had called for, will cleave eBay almost in half and separate it from a company that generates almost half its revenue."
## [91] "The team behind the popular Lower East Side restaurant are publishing a collection of 100 seasonal dishes meant to provide both information and inspiration. Here, they share a recipe with T."
## [92] "Elizabeth C. Gorski gives us the lowdown."
## [93] "The British media and marketing firms acquisition of Advanstar, a private company, would create the largest events organizer in the USA."
## [94] "The initial public offering of Pershing Square Holdings is expected to give the activist investor William A. Ackman a permanent pool of capital to make bigger, bolder bets."
## [95] "James R. Clapper Jr., the director of national intelligence, sent a message to employees on Tuesday defending the nations spy agencies against criticism."
## [96] "David A. ONeil has not announced his next step, though he will probably swing through Washingtons revolving door."
## [97] "A Hungarian musicologist finds Mozarts own score of the Piano Sonata in A, K. 331."
## [98] "After 20 years on the board of the Alvin Ailey Dance Foundation, Joan H. Weill, its chairwoman, is expected to announce Thursday that she is stepping down at the end of the year."
## [99] "Several recent bankruptcy cases and news events have shown that municipal and corporate debt securities have faced issues in pricing and trading, Stephen J. Lubben writes in an In Debt column."
## [100] "The countrys largest banks are rolling out a new service that could help them lessen their reliance on expensive data terminals like those sold by Bloomberg L.P."
## [101] "What White House security breaches happened during the presidencies of Franklin D. Roosevelt, Herbert Hoover and Ronald Reagan?"
## [102] "Politicians and NewYorks elite gathered Wednesday for the Alfred E. Smith charity dinner and roast."
## [103] "George W. Bush talks about his brother Jebs presidential aspirations."
## [104] "Michael R. Bloomberg and two Disney princesses."
## [105] "Former President George W. Bush said on Thursday that he had been lobbying his younger brother Jeb to throw his hat in the ring in 2016"
## [106] "The government said the criminal charges against Michael J. Coscia, founder of Panther Energy Trading, were the first to be brought under new rules that bar a type of abusive trading called spoofing."
## [107] "The Solomon R. Guggenheim Museum is planning the construction of a new building somewhere in NewYork City that will be used for offices, art storage and some public programming."
## [108] "Speaker John A. Boehner has picked Elise Stefanik, who is running for an open House seat in upstate NewYork, to deliver the partys radio address on Saturday."
## [109] "As the billionaire Kenneth C. Griffin battles his wife in divorce court, he has drawn back the curtain on one of the most prominent marriages in hedge fund world by disclosing terms of their prenuptial agreement."
## [110] "Jesse C. Litvak had been one of the few people convicted of fraud over the bailout of WallStreet, but an appeals court suggested that his conviction was likely to be overturned."
## [111] "Before my son started school, I shared my parents hippie view of the P.T.A. as a perfect-parent filled hassle and just another part of The Establishment trying to squelch creativity out of learning."
## [112] "Vice President Joseph R. Biden Jr. held a question and answer session at Harvard on Thursday."
## [113] "The investor Edward J. Bramson had been pushing to shake up the board of the British private equity firm Electra Private Equity and had sought two seats on its board."
## [114] "Including hard terms for AIG was politically necessary to getting the TARP program going, but Henry M. Paulson Jr. said he did support the bailout package and its terms."
## [115] "These D.I.Y. magazines are on the rise in LosAngeles, where artists are using them as a showcase for their work."
## [116] "A growing body of research indicates that many people who react to gluten may be suffering a condition called non-celiac gluten sensitivity, or NCGS."
## [117] "A big hurdle in the spoofing case against a high-frequency trading firm is that a jury must decide whether one computer fooling another is a crime, Peter J. Henning writes in the White Collar Watch column."
## [118] "Just 24 percent of respondents to a Pew Research Center poll correctly picked Janet L. Yellen as the Feds chairwoman from a list of four names."
## [119] "Maurice R. Greenberg, AIGs former chief and a large shareholder, has spun a ludicrous tale in court that the bailout of the insurer was unfair to its investors."
## [120] "Duncan L. Niederauer, who recently retired as head of the NewYork Stock Exchange, has joined an upstart brokerage firm called Battery East, which aims to help employees of privately held companies sell their shares."
## [121] "Big banks face another round of USA charges. | Henry M. Paulson Jr. testifies that punitive AIG terms were necessary. | Investors cheer the breakup of HP. | Lawyers for Goldman and Libyas sovereign fund clash in court."
## [122] "Timothy F. Geithner, the former Treasury secretary, was a witness in the trial of a lawsuit over the departments role in the bailout of AIG His book was under scrutiny, too."
## [123] "By chronicling the building blocks of technological advances, Walter Isaacsons The Innovators hopes to teach us about the nature of innovation, Jonathan A. Knee writes in a review, the debut Book Entry column."
## [124] "President Obama continues to get low marks on his handling of the threat from the Islamic State, also known as ISIS or ISIL."
## [125] "Valeant and Pershing Square are planning to raise their offer for Allergan. | Profit at Goldman less easy to find. | AIG trial puts Timothy F. Geithner on the hot seat. | Glencores chief pursues Rio Tinto."
## [126] "A derivatives rule is set to change. | JPMorgans data breach causes alarm. | Chinese companies are scooping up real estate across the globe. | Former Treasury Secretary Timothy F. Geithner defends the bailout of AIG"
## [127] "The $2.6 billion deal, which includes debt, is the latest transaction to reshape the drug industry. Auxilium is terminating its deal to acquire QLT."
## [128] "Carl C. Icahn, the billionaire activist investor, sent a letter to Timothy Cook, the chief executive of Apple, saying that the company is hugely undervalued."
## [129] "Critics want a war memorial to remove a sculpture based on Alfred Eisenstaedts famous Life Magazine photo of a sailor kissing a nurse at the end of World War II."
## [130] "One way to level the playing field is to simply repeal the safe harbors in the bankruptcy code, writes Stephen J. Lubben in the In Debt column."
## [131] "Facebook has a new local mobile advertising play. J.C. Penney is in the midst of an e-commerce renaissance."
## [132] "The AM1 Supreme by MARCH LA.B bears the hallmarks of vintage watches, but with a streamlined aesthetic."
## [133] "The government acknowledged there was scant legal precedent for its demand for $1.6 million in restitution from Jesse C. Litvak, who was convicted of securities fraud in March."
## [134] "Ben S. Bernanke, the former Federal Reserve chairman, kept his answers brief on the stand in the lawsuit over the 2008 bailout of American International Group."
## [135] "A sluggish global outlook sends ripples through the markets. | Tuning up UBSs investment bank. | Symantec announces a split. | Ben S. Bernanke defends the AIG bailout."
## [136] "The business will merge with a fledgling firm founded by Paul J. Taubman, the former Morgan Stanley investment banker, who will run the combined firm as chief executive and chairman."
## [137] "If you want to time-travel to the 1990s, youll get a chance at 1 p.m. today, when the William J. Clinton Library releases 10,000 more pages of previously undisclosed documents from the Clinton years."
## [138] "Field + Supply, founded by the interior designer Brad Ford, launches this weekend in a barn in High Falls, N.Y. Here, a look at some of the furniture and objects on offer."
## [139] "Ben Ratliff and Jon Caramanica discuss the singers Tinashe and FKA twigs, and whether they point toward a new conception of R&B."
## [140] "The Army War College rescinded the masters degree of Senator John E. Walsh on Friday, determining that the Montana Democrat plagiarized his final paper there in 2007."
## [141] "A restaurant learns what happens when you tell customers to pay what God wants them to pay, Microsofts chief executive says its O.K. for women to ask for a raise, and new restaurants are making Detroit a culinary oasis."
## [142] "A new USAO. lounge for service members and their families has opened at Terminal 5 at Kennedy Airport in NewYork."
## [143] "Vice President Spiro T. Agnew resigned on this day in 1973."
## [144] "Holly Fallon, whose stage name is Dollicious, and Miriam Hintz, who goes by Haru, posed in the lower courtyard of the Jacob K. Javits Convention Center on Friday at NewYork Comic Con."
## [145] "Two cases raise the question about what role the courts should play in policing negotiations and the limits that can be applied to the tactics one side can use in making a deal, Peter J. Henning writes in the White Collar Watch column."
## [146] "The awards will be announced on Nov. 23 in a show broadcast by ABC."
## [147] "A 28 year old digital communications director is helping to turn Speaker John A. Boehner, no ones idea of a digital-age guru, into a YouTube star."
## [148] "Abigail Johnson, 52, who is president of the parent company of Fidelity Investments, will succeed her father, Edward C. Johnson III, as chief executive."
## [149] "Photos from HongKong, Syria, Yemen and the USA."
## [150] "Adam G. Perl puns his way through our Tuesday puzzle."
## [151] "JPMorgan Chase posts a third quarter profit. | Warren E. Buffett promotes the Berkshire Hathaway brand. | Calculating the cost of Ebola | Derivatives change only goes so far."
## [152] "Denis J. McInerney, a former deputy assistant attorney general in the Justice Department, has returned to the law firm Davis Polk & Wardwell."
## [153] "President Obama has decided to wait until after next months midterm elections to nominate a replacement for Attorney General Eric H. Holder Jr., White House officials said."
## [154] "Some of it is OK."
## [155] "Highlights from the International Herald Tribune archives: Nikita S. Khrushchev was believed to be ousted as the Soviet leader in 1964."
## [156] "Hopper Drawing, by Carter E. Foster, was published by the Whitney Museum of American Art in connection with a Hopper exhibition in 2013."
## [157] "Airbnb, the pioneering home rental service, presents itself as useful and virtuous, but the reality is far less benign, according to Attorney General Eric T. Schneiderman of NewYork"
## [158] "The FBI director, James B. Comey, corrected statements he made on 60 Minutes that the bureau did not do electronic surveillance without a court order."
## [159] "Ben C. Solomon is a Times video journalist reporting on Ebola. His video today, about a team of ambulance drivers in Monrovia, Liberia, shows the dangers they face every day."
## [160] "James L. Amine and Timothy P. OHara have been appointed to the Swiss lenders executive board and will lead the investment banking division with Gal de Boissard, the chief executive for Europe, the Middle East and Africa."
## [161] "Food news from Silicon Valley to the UK."
## [162] "The venture capitalist Marc Andreessen is stepping down from eBays board, months after publicly defending his role as a director against attacks from Carl C. Icahn."
## [163] "Best known for his operas and symphonies, Mr. Glasss Piano tudes are the focus of a new recording and concerts at BAM."
## [164] "An SEC case against a high-frequency trading firm shows how difficult it is to draw the line between acceptable trading strategies and manipulation, Peter J. Henning writes in the White Collar Watch column."
## [165] "Vice President Joseph R. Biden Jr. appeared in a hangar near La Guardia Airport on Monday to endorse Gov. Andrew M. Cuomos plans for commissioning a redesign of four airports in and around NewYork City."
## [166] "Senator Patrick J. Leahy, chairman of the Judiciary Committee, is asking Comcast not to engage in paid prioritization of Internet content after its proposed takeover of Time Warner Cable."
## [167] "Tom Steyer, a billionaire hedge fund founder, has become the largest super PAC donor of all time, passing the casino magnate Sheldon G. Adelson."
## [168] "The piano superstar plays the solo cadenza for the first movement of Mozarts Piano Concerto No. 17 in G."
## [169] "If you are wondering why you have been getting fund-raising pitches from the Democratic Congressional Campaign Committee theres at least one explanation: an organization you did provide your email to has rented its list to the WashingtonDCC.C."
## [170] "Through a clever bit of photo manipulation, a new ad by the NRA puts Iowas Democratic Senate nominee, Bruce Braley, next to the man many gun owners consider their nemesis: Michael R. Bloomberg."
## [171] "Photos from SouthAfrica, HongKong, India and the USA."
## [172] "Vice President Joseph R. Biden Jr. beat the comics to the punch line at a recent Kennedy Center appearance."
## [173] "A federal judge ordered a full mental evaluation in the next 30 days of Omar J. Gonzalez, the man accused of jumping the fence and racing past Secret Service officers into the White House with a knife last month."
## [174] "Highlights from the International Herald Tribune archives: French farmers disagreed with General de Gaulles threat to quit the E.E.C. in 1964."
## [175] "The Feds policies that drove down interest rates to historically low levels have actually exacerbated the inequality problem that Janet L. Yellen said concerned her, writes William D. Cohan in Street Scene."
## [176] "Ben Bradlees funeral will be held Tuesday at the Washington National Cathedral, where Richard M. Nixon was remembered in a memorial service in 1994."
## [177] "NewYorkTimess David E. Sanger remembers a conversation with Ben Bradlee about an article in NewYorkTimes."
## [178] "At an investment conference, Scott A. Mather, who now oversees Pimcos Total Return Fund, defends its outsize holdings in European government bonds and other high-yielding securities."
## [179] "Children who drink rice, almond or soy milk instead of cows milk may have insufficient levels of vitamin D."
## [180] "A man later identified as a J.P. Morgan managing director walked into a live, online broadcast of a group of prominent protesters on Wednesday to vent his frustration with the continuing occupation of key locations in HongKong."
## [181] "Gov. Andrew M. Cuomo of NewYork foresees something really, really, really big for Hillary Rodham Clintons future."
## [182] "A hedge fund has emerged as perhaps the last hope for two struggling retails. | Bankers are jockeying for the next sovereign debt deal in Africa, despite plenty of risks. | Shareholders decide today whether to approve Chiquitas plan to acquire an Irish rival. | Charles T. Munger gives to theoretical physics."
## [183] "Richard Norton Smiths biography of Nelson Rockefeller is a portrait of the world when establishment financiers earned political respect, writes Jonathan A. Knee in a book review."
## [184] "A senior administration official confirms that Kathyrn Ruemmler, the former White House counsel, has taken herself out of the running to succeed Eric H. Holder Jr. as attorney general."
## [185] "The festival of theater, dance and performance art presented by P.S. 122 will take place at spaces around the city, while its home on First Avenue is renovated."
## [186] "Ben C. Solomon has been making videos about the Ebola outbreak, finding scenes of courage even as people struggle to contain the disease."
## [187] "Most European banks pass stress test. | Steven Ballmer could claim huge tax benefits on his LosAngeles Clippers deal. | William A. Ackmans outsize bets. | Court ruling disarms shareholders."
## [188] "Arguably the greatest feat of arms in American military history was performed on the Roanoke River at Plymouth, N.C., in the predawn darkness of Oct. 28, 1864."
## [189] "One side threatens to crack down harder; the other side complains about too much enforcement. The question is whether both sides can be satisfied, Peter J. Henning writes in the White Collar Watch column."
## [190] "Plus, El Anatsuis shimmering curtains, Martin Z. Marguliess photography collection and more art events in the week ahead."
## [191] "Vice President Joseph R. Biden campaigns for Representative Bruce Braley in Iowa on Monday."
## [192] "D.J. Tim Sweeneys cult-favorite program turns 15 next year. To celebrate, he has compiled a new double album, a song from which premieres here."
## [193] "William J. Burns, who just stepped down as deputy secretary of state, is being named president of the Carnegie Endowment for International Peace."
## [194] "Timothy D. Cook, Apples chief executive, said that one million credit cards had been activated on Apple Pay in the first three days that the mobile payment system was live."
## [195] "The information of more than 18.5 million California residents was compromised in 2013, according to Kamala D. Harris, Californias attorney general."
## [196] "An Apple executive recently called the design of Xiaomis phones theft. In return, Mr. Barra pointed out that parts of Apples new phones are a bit like handsets from HTC."
## [197] "Elizabeth C. Gorski cooks up a funny midweek puzzle for us."
## [198] "Michael E. Shapiro, who transformed and enlarged the collection of Atlantas High Museum, will leave next year."
## [199] "Sales of Gov. Andrew M. Cuomos memoir fell by more than 43 percent to 535 copies in its second week on shelves."
## [200] "Kering announced a new sustainable fashion program and the C.F.D.A. and Lexus announced the winners for their Eco-Fashion challenge."
## [201] "Quinn Bradlee, a son of Benjamin C. Bradlee, the former executive editor of The Washington Post, rested his head on the coffin after he spoke at his fathers funeral at Washington National Cathedral on Wednesday."
## [202] "He had claimed that the game publisher Activision Blizzard used his image without his permission in Call of Duty: Black Ops II."
## [203] "Senator Angus King, the Maine independent who had backed another independent, Eliot Cutler, for governor, switched his allegiance to the Democrat, Representative Michael H. Michaud."
## [204] "Photos from Turkey, China, India and the USA."
## [205] "Thomas M. Menino, who served as Bostons mayor for 20 years, has died at the age of 71."
## [206] "Neil L. Rudenstine has presided over the library along with Anthony W. Marx, its president during planning for the institutions controversial renovation."
## [207] "Tim Cooks public announcement that he is gay has been met with support, but some point out that the battle for L.G.B.T. equality at work is far from over."
## [208] "David W. Dunlap has worked at NewYorkTimes for 39 years. He recalls the early news coverage of AIDS and compares it to NewYorkTimess coverage of the arrival of the Ebola virus in the USA."
## [209] "Democratic nostalgia seemed to be the theme of the night as former President Bill Clinton campaigned on Thursday for Gov. Andrew M. Cuomo."
## [210] "The French banks results were a turnaround from the previous quarter, when it took a charge of nearly 6 billion euros for legal penalties in the USA."
## [211] "Albert G. Horvath, the Smithsonian Institutions current senior finance official, will serve as secretary until David J. Skorton can take up his permanent role in July."
## [212] "James B. Stewart provides insight into his column about the Apple chief executives announcement that he is proud to be gay."
## [213] "The return of Janet R. Bender."
## [214] "On Tuesday, Michael S. Dell will try to persuade people that his company is about far more than the personal computers and computer servers it has been known for, with products intended for things as varied as the cloud computing networks of global enterprises and handy personal devices."
## [215] "The government has imposed billions of dollars in penalties on the big banks for wrongdoing. But, Peter J. Henning writes in the White Collar Watch column, that may not be enough to change banks behavior."
## [216] "C.H.C.M., the influential Bond Street shop, is debuting an in-house range of refined wardrobe essentials."
## [217] "Photos from Turkey, Syria, India and the USA."
## [218] "Relations between tech companies and the intelligence community has been strained recently, but Adm. Michael S. Rogers played down government concerns."
## [219] "Highlights from the International Herald Tribune archives: Europeans were happy Lyndon B. Johnson was elected in 1964."
## [220] "Millstein & Company, the financial advisory firm founded by James E. Millstein, has hired Mark Walker, a former banker with Rothschild."
## [221] "Despite efforts to improve disclosure under the Freedom of Information Act, the SEC continues to resist sharing data with the public, William D. Cohan writes in the Street Scene column."
## [222] "William A. Ackmans hedge fund could vote its stake in Allergan in support of ousting directors at a meeting scheduled for Dec. 18, bolstering his effort to force the company into a sale."
## [223] "Some post-shellacking advice for the POTUS."
## [224] "Photos from Israel, Algeria, Russia and the USA."
## [225] "John L. Thornton, the chairman of Barrick Gold and a professor at Tsinghua University in Beijing, was designated as a possible successor to Henry M. Paulson Jr. when they were at Goldman Sachs."
## [226] "A. Jerrold Perenchio, the former chairman of Univision, pledged to donate about 50 artworks including notable Impressionist paintings "
## [227] "Rita Doves poem November for Beginners, and the article Heavy Autumn Snowstorm Barrels Across Northeast, by Al Baker, Elizabeth A. Harris and Sarah Maslin Nir, appear in this pairing."
## [228] "An emboldened Speaker John A. Boehner warned President Obama against unilateral action to overhaul the immigration system."
## [229] "The 82 year old has been recording with her daughter, Patsy L. Russell, and John Carter Cash, for a project combining new songs and old folk and gospel tunes."
## [230] "The designation of MetLife as systemically important is a sign that regulators are looking ahead to the next possible crisis, Stephen J. Lubben writes in the In Debt column."
## [231] "The Democratic Senatorial Campaign Committee cancels $2 million worth of ads on behalf of Senator Mary L. Landrieu of Louisiana."
## [232] "Photos from Ukraine, Syria, Scotland and the USA."
## [233] "Barry C. Silk writes puzzles for grown-ups, but once in a while you can find stuff for less mature adults like me."
## [234] "The Sony Pictures Classics co-president will take over from Herbert S. Schlosser, who will be chairman emeritus."
## [235] "Vice President Joseph R. Biden Jr. spoke quickly on his relationship with Prime Minister Benjamin Netanyahu of Israel during remarks to the Jewish Federation."
## [236] "Mr. Morrison will play J.M. Barrie in the show about the creation of Peter Pan; he and Jeremy Jordan have both played the role in earlier productions."
## [237] "The term political correctness is perhaps most often used by its detractors but new research has found that thinking about being P.C. can actually improve creativity."
## [238] "Senator Patrick J. Leahy said he believes Loretta E. Lynch, President Obamas choice for attorney general, should be confirmed easily no matter when the Senate votes."
## [239] "By analyzing samples from each of the five NewYork City boroughs, Jane M. Carlton hopes to detect viruses before an outbreak."
## [240] "Is the food label \"natural\" misleading to consumers? Should the F.D.A. enforce a definition of the term?"
## [241] "Another small sampling of sparkling prose in recent editions, from serious to silly to Kenny G."
## [242] "President Obama and President Vladimir V. Putin of Russia discussed Iran, Syria and Ukraine during three brief conversations at the Asia-Pacific Economic Cooperation summit meeting."
## [243] "Andrea C. Bonomi, the Italian businessman battling a Chinese French bid for Club Mditerrane, announces a sweetened offer for the resort company."
## [244] "Reporters were surprised on Tuesday morning when the Federalist Society, a conservative legal group, announced that the news media would be barred from an appearance by Justice Samuel A. Alito Jr. at its annual gala black-tie dinner."
## [245] "Heaven, the first offering from the Birmingham raised, LosAngeles-based 24 year old T.O.L.D., premieres here."
## [246] "The activist investor William A. Ackman has acquired an 8.5 percent economic interest in the former animal health arm of Pfizer."
## [247] "Photos from France, China, Poland and the USA."
## [248] "Both steps fall short of the big breakup of Dow Chemical being advocated by the activist hedge fund manager Daniel S. Loeb."
## [249] "Mr. Grammer is to play the largely comic role of Charles Frohman, the theater producer of Peter Pan playwright J.M. Barrie."
## [250] "Photos from Turkey, HongKong, Afghanistan and the USA."
## [251] "Steven A. Ballmer, Microsofts former chief executive, is making a financial contribution that will enable the school to expand its computer science faculty by about 50 percent."
## [252] "Dassault Systmes, a French company, has a five year research agreement with the F.D.A. focused on using 3 D technology for simulating the reliability of pacemaker wires."
## [253] "Its in the bullet points: Shoes, bags, more mobile and social media, and a focus on the USA."
## [254] "Berkshire Hathaway, the conglomerate run by Warren E. Buffett, will acquire Duracell using a transaction aimed at lowering the tax bill."
## [255] "President Obama and Prime Minister Dmitri A. Medvedev of Russia at the East Asia summit plenary session in Naypyidaw, Myanmar, on Thursday."
## [256] "Readers wrote about a public reckoning for our wars in Iraq and Afghanistan and the sacrifices of our troops in comments on Daniel P. Bolgers Op-Ed, The Truth About the Wars."
## [257] "Warren E. Buffett has an appetite for large and creatively structured acquisitions. A look back at some of his recent deals."
## [258] "Thomas O. Kriegsmann is to be its director of programs, a new position that will include artist development projects."
## [259] "David W. Dunlap, a Metro reporter for NewYorkTimes, took us back to the moment when the paper stopped being produced with Linotype machines."
## [260] "Daniel S. Loeb, the activist hedge fund manager, has been pressing for a breakup of Dow Chemical and has been rebuffed in his attempt to name two directors."
## [261] "More than $40 million has been raised in honor of Joan H. Weill, who is stepping down as chairwoman of the Alvin Ailey Dance Foundation."
## [262] "Net neutrality could give Comcast a way out of its Time Warner Cable acquisition. | Warren E. Buffetts deal for Duracell satisfies a number of his deal-making preferences. | Virgin America to begin trading. | Goldman seeks to lure tech talent."
## [263] "Evan R. Chesler, the chairman of the law firm Cravath, Swaine & Moore, will take over as the library chairman from Neil L. Rudenstine."
## [264] "Finish Big and Succession, examine how entrepreneurs and corporate chieftains alike can be assessed on their departures, Jonathan A. Knee writes in a joint book review."
## [265] "Senator Mary L. Landrieus campaign has not hesitated to embrace her stance on the XL Pipeline ahead of her runoff election against Representative Bill Cassidy."
## [266] "Celebrities should not quit their day jobs, according to Andrew J. Ries."
## [267] "And what about that I.S.I. dreamboat Aasar Khan?"
## [268] "The alliance between the American and German companies is expected to strengthen Mercks oncology business in several important markets, including the USA."
## [269] "The animal health care company Zoetis, which has attracted the interest of the deal maker William A. Ackman, has made a deal of its own."
## [270] "The $66 billion acquisition by Actavis would be the largest deal this year and would thwart the takeover effort of Valeant and William A. Ackman."
## [271] "Representative Nancy Pelosi has a birthday wish for Speaker John A. Boehner."
## [272] "Insider trading cases are casting a spotlight on whether courts should defer to administrative agencies in determining criminal violations, Peter J. Henning writes in the White Collar Watch column."
## [273] "Representative Steve Israel joked about handing off the WashingtonDCC.C. chairmanship."
## [274] "Mayor Francis G. Slay said troops operating within the city of SaintLouis would have a secondary role in responding to any protests, and that police officers would be the ones dealing directly with demonstrators."
## [275] "What do you do when you are a billionaire and a movie buff? If you are Paul G. Allen, you turn the iconic theater you already own into a state-of-the-art venue."
## [276] "William A. Ackman and Valeant Pharmaceuticals lost their hostile battle to buy Allergan and still came away with a total of $2.6 billion."
## [277] "WallStreet is on pace for a big year of deal-making. | William A. Ackman may have lost Allergan, but he still came away a winner. | The Justice Department is weighing a civil suit against Angelo R. Mozilo, the former chief executive of Countrywide Financial. | USA plans second Bitcoin auction."
## [278] "A briefing book for the White House chief of staff, Denis R. McDonough, had a note from one of his children attached to it as he sat in a meeting on Ebola in the White House on Tuesday."
## [279] "Much like their leader, Warren Buffett, the chief executives of Berkshire Hathaways subsidiaries are a savvy group of acquirers. Lawrence A. Cunningham, in the Another View column, explains why."
## [280] "American Ballet Theater and the Segerstrom Center for the Arts in Costa Mesa, Calif., will open the American Ballet Theater William J. Gillespie School on the Segerstrom campus in September, both organizations announced on Tuesday."
## [281] "Assembling the tale of G.M.s faulty ignition switch, which led to the deaths of at least 13 people, took months of digging by a team of reporters and editors."
## [282] "Joseph P. Clancy, acting director of the Secret Service, is to testify before Congress."
## [283] "Thomas B. Leonardi will be joining the investment bank as a senior adviser, focusing on the insurance industry."
## [284] "The release of an interview Robert Rubin gave nine years ago offers a window into the thoughts of the rich and powerful, William D. Cohan writes in the Street Scene column."
## [285] "On Saturday, the Arcangel-composed series Dances for the Electric Piano will be staged for the first time in the USA. Listen to it here."
## [286] "Representative Paul D. Ryan, the new chairman of the Ways and Means Committee, said that a broad overhaul of the tax code depended on the administrations engagement."
## [287] "Gov. Bobby Jindal of Louisiana tried to cut off an immigration discussion with Chuck Todd of NBC."
## [288] "The country plans to scrap its state monopoly on the sale of salt, a move that would mark the end of a system that can be traced back to 685 B.C."
## [289] "Despite being about to receive his Ph.D. in physics from Harvard, Kevin Niu plans to eschew the academic life to follow his calling in the entertainment industry."
## [290] "Daniel K. Tarullo, the Fed governor, described to a Senate subcommittee potential rules that could restrict banks from some types of commodities operations."
## [291] "Speaker John A. Boehner addressed President Obamas immigration actions."
## [292] "Is the Federal Reserve up to the task of regulating financial institutions that are so large and complex? the chairman of a Senate panel asked William C. Dudley."
## [293] "Attorney General Eric H. Holder Jr. offered guidance to law enforcement agencies and urged protesters to be restrained."
## [294] "Vice President Joseph R. Biden Jr. had to beat a hasty retreat on Friday at the sight of an unruly crowd of Ukrainians in Kiev protesting the new government amid continuing unrest in that former Soviet republic."
## [295] "Paul J. Taubman is bringing Don Cornwell, a managing director who specializes in sports team deals, to his boutique investment bank."
## [296] "At noon on Friday a handful of people performed a mock lynching across the street from the Old Courthouse in SaintLouis as a silent protest of the police killings of Michael Brown and Vonderrit D. Myers Jr."
## [297] "Last week was a rough one for the Federal Reserve. | Paul J. Taubman continues to poach talent from Morgan Stanley, his former employer. | Mathew Martoma has started his nine year prison term for insider trading."
## [298] "The Chapter 11 filing by Aereo prompts a larger question of whether bankruptcy courts should actively police debtors motives for filing cases, writes Stephen J. Lubben in the In Debt column."
## [299] "The Fed has to balance the conflicting roles it plays as an overseer of the banks and as a law enforcement agency pursuing misconduct, Peter J. Henning writes in the White Collar Watch column."
## [300] "Speaker John A. Boehner announced that he was reappointing Representative Trey Gowdy, Republican of SouthCarolina, as chairman of the special committee on Benghazi for the 114th Congress that begins in January."
## [301] "Senator Elizabeth Warrens wrath toward Antonio F. Weiss is misdirected, and her understanding of the inversion deal on which she bases her opposition appears misinformed."
## [302] "Senator Charles E. Schumer said Tuesday that it was a political mistake to pass the Affordable Care Act in 2010 because voters at the time were looking for relief from the recession not universal health care."
## [303] "Photos from Egypt, HongKong, Kenya and the USA."
## [304] "The return of Michael S. Maurer."
## [305] "Thanksgiving was my Korean familys annual recommitment ceremony to the USA."
## [306] "The potential political costs of the civil unrest in Ferguson, Mo.; speculation about Chuck Hagels replacement; and a look at Speaker John A. Boehners turkey brine recipe."
## [307] "StevenACohens ex-wife gets outside financing for her lawsuit. | Poison puts develop a downside. | Trustee moves to repay Mt. Gox creditors. | Philip A. Falcone to Depart the Harbinger Group."
## [308] "Photos from HongKong, China, Syria and the USA."
## [309] "In Newport, R.I., visitors can tour storied mansions decorated for Christmas."
## [310] "Michael B. Jordan and John Turturro will take part in a reading of the script for Spike Lees Do the Right Thing"
## [311] "David W. Dunlap, a Metro reporter and the writer of the Building Blocks column, explored the sometimes close connection between Mr. Castro and NewYorkTimes."
## [312] "A wide open grid from Elizabeth C. Gorski."
## [313] "Celebrate the holidays with Elizabeth C. Gorski."
## [314] "Programs, and payouts, to encourage whistle-blowers to come forward with wrongdoing are likely to increase with the new Congress, Peter J. Henning writes."
## [315] "Jeh C. Johnson, the Homeland Security secretary, is prepared to defend the presidents decision to take executive action to overhaul the immigration system."
## [316] "Photos from HongKong, Paris, SouthKorea and the USA."
## [317] "The reporters J. David Goodman and Michael Wilson described a suburban mothers descent to a heroin addict whose home on Staten Island served as a drug-dealing center."
## [318] "Chief Justice John G. Roberts Jr. is well known for his ability to quote obscure case law, but on Monday he invoked the rapper Eminem at the Supreme Court."
## [319] "Mary L. Landrieu, a three term senator, is in the race of her career against the Republican Bill Cassidy as Louisiana voters head to the polls for a runoff vote on Dec. 6."
## [320] "Bank of NewYork Mellon said on Tuesday that it would give a board seat to Edward P. Garden of the hedge fund Trian Fund Management, forestalling a potentially costly proxy fight."
## [321] "The deal for IndCor Properties will make Singapores sovereign wealth fund a major owner of warehouses and distribution centers in the USA."
## [322] "Speaker John A. Boehner on options that Republicans are considering to block President Obamas immigration action."
## [323] "Senator Joseph R. McCarthy was censured 60 years ago today."
## [324] "Jonah M. Kessel filmed a documentary video in China and Myanmar, where jade mines have become a free market for heroin, and tells of the story of an addict he encountered."
## [325] "In a spellbinding letter to Jim Cramer, the activist investor J. Carlo Cannell urged him to cut his pay 70 percent, resign from CNBC and direct his energy to helping your fellow shareholders crawl back from Hades."
## [326] "The Fox affiliate in NewOrleans is taking issue with the use of its news anchors words in a campaign ad for Senator Mary L. Landrieu."
## [327] "NewYorks chokehold case has thrown a volatile element into Loretta E. Lynchs confirmation hearings as USA attorney general."
## [328] "Senator Elizabeth Warrens opposition to a nominee for a role at the Treasury Department demonstrates her blatant political motivation, William D. Cohan writes in the Street Scene column."
## [329] "J. Carlo Cannell wants Jim Cramer to sell TheStreet or take a pay cut and quit CNBC. A sale may be wishful thinking, says Quentin Webb of Reuters Breakingviews. But Mr. Cramers payout is hard to justify."
## [330] "The appellate division of the NewYork State Supreme Court dismissed a lawsuit at the heart of a bitter dispute between the billionaire collector Ronald O. Perelman and the art dealer Larry Gagosian."
## [331] "Photos from Russia, SouthAfrica, India and the USA."
## [332] "Andrea C. Bonomi, the Italian businessman fighting a Chinese investor for Club Mditerrane, raised his bid to $29.65 a share for the French resort operator."
## [333] "President Obama on Friday announced his selection of Ashton B. Carter to lead the Pentagon, White House officials said, embracing a physicist and national security centrist who may advocate a stronger use of American power."
## [334] "Jon Caramanica and A.O. Scott discuss what this update of The Bodyguard gets right about the music business."
## [335] "The famously outspoken, leather-clad architect is being honored this week with Design Miamis inaugural Design Visionary Award and an exhibition at the Bass Museum of Art. In this video, he discusses his career with T."
## [336] "An architect who hopes to save a Paul Rudolph-designed building in Orange County, N.Y., presented his detailed proposal to county leaders there on Friday."
## [337] "Demonstrations focused on fatal police encounters continued on Saturday, with a rally at the Louis H. Pink Houses in Brooklyn to protest the fatal shooting of Akai Gurley there and a grand jurys decision not to indict an officer in the death of Eric Garner."
## [338] "Will the SECs treatment of Bank of America, Peter J. Henning asks in the White Collar Watch column, become the norm for other banks seeking waivers from rules on bad actors?"
## [339] "After years of research, there is genuine hope for reducing the toll exacted by lung cancer, which is the leading cause of cancer deaths in the USA."
## [340] "Senator Charles E. Schumer of NewYork may be Republicans favorite Democrat these days."
## [341] "Recommendations from the American Bankruptcy Institute failed to address derivatives held by nonfinancial institutions that will keep operating, writes Stephen J. Lubben in the In Debt column."
## [342] "Readers expressed outrage and despair in comments on a Friday editorial, It Wasnt Just the Chokehold: Eric Garner, Daniel Pantaleo and Lethal Police Tactics, and on an Op-Ed from Eric L. Adams, We Must Stop Abuse of Black Men."
## [343] "Jessica Bibliowicz will become chairwoman of the Cornell Weill Medical College, effective Jan. 1. Her father, Sanford I. Weill, has been chairman for two decades."
## [344] "The SecondMarket exchange won all but one of the 20 Bitcoin blocks in the governments auction. The venture capitalist Timothy C. Draper, who swept the first auction, won only 2,000 Bitcoins this time."
## [345] "Brian T. Moynihan tells a conference that his bank is expecting trading revenue in the fourth quarter to be down from last quarter and from a year earlier."
## [346] "The revival of A.R. Gurneys play will close this Sunday, before some of its scheduled stars ever get to perform."
## [347] "Jonathan Gruber, the M.IT economist who advised the Obama administration on the Affordable Care Act, apologized on Tuesday for inflammatory comments that have brought negative attention to the law in recent months."
## [348] "The Senate Intelligence Committee has released its long-awaited review on the torture of prisoners held by the Central Intelligence Agency during the George W. Bush administration."
## [349] "Senator Dianne Feinstein, Democrat of California and chair of the Senate Intelligence Committee, headed to the Senate floor Tuesday to talk about the committees newly released report on the torture of prisoners held by the Central Intelligence Agency under the George W. Bush administration."
## [350] "Several Republican members of the Senate Intelligence Committee pushed back against the release of the declassified executive summary of the panels report on the torture of prisoners by the Central Intelligence Agency. They condemned the report on techniques used during the George W. Bush administration as partisan."
## [351] "Just hours after announcing its $8.4 billion acquisition of Cubist Pharmaceuticals, a Delaware judge invalidated patents owned by the antibiotics maker. Merck appears to have been caught up in the M.&.A. exuberance, says Robert Cyran of Reuters Breakingviews."
## [352] "Recent criticism of James J. Cramers compensation from TheStreet.com has painted him as another greedy WallStreet executive looking out for himself but not shareholders."
## [353] "Gov. Chris Christie of NewJersey, who was appointed USA attorney for the state by President George W. Bush in the weeks after the Sept. 11 attacks, seemed not eager on Wednesday to discuss the Senate report on the brutal American interrogation techniques used on terrorism suspects a decade ago."
## [354] "In this lesson we offer three different teaching ideas to engage students in the debate over immigration policy in the USA."
## [355] "The latest appeals court decision will make some insider-trading cases harder, but it does not give a free pass to hedge funds to trade on any confidential information, Peter J. Henning writes in White Collar Watch."
## [356] "The latest NewYorkTimes poll is based on telephone interviews conducted Dec. 4 through 7 with 1,006 adults throughout the USA."
## [357] "The Confucius Peace Prize was first given out in late 2010 as a rejoinder to the Nobel Peace Prize. President Vladimir V. Putin of Russia is a past recipient."
## [358] "Were going to continue to press his nomination forward, Treasury Secretary Jacob J. Lew said. He also spoke about corporate taxes and sanctions on Russia."
## [359] "George R. Goldner will become a private art adviser after 21 years heading the department. Nadine M. Orensteinhas been named to succeed him."
## [360] "Lloyd C. Blankfein, Goldman Sachss chief executive, addressed the antagonism that has been coming out of Washington."
## [361] "Francis J. Shammo, Verizons chief financial officer, said the company planned to continue to invest in its FiOS fiber-optic network and its wireless systems regardless of the outcome of the broadband debate."
## [362] "Senator Dianne Feinstein, chairwoman of the committee that investigated the CIAs interrogation program, offered a live rebuttal as John O. Brennan, the agencys director, responded to the committees findings."
## [363] "The Blackstone Group, the private equity giant run by Stephen A. Schwarzman, is almost finished raising a new fund for energy investments that is expected to exceed $4 billion in assets."
## [364] "OscarSSchafer will succeed the current NewYork Philharmonic chairman, Gary W. Parr, early next year."
## [365] "Friday and the weekend: a struggle for L.G.B.T. protections, more chilly weather, protests on Saturday, and the week in pictures."
## [366] "An A.C.L.U. video features military personnel who objected to the Bush-era torture program."
## [367] "Two new books provide insight into lesser-known stages of Charles M. Schulz and Jack Kirbys careers."
## [368] "On this day in 2000, the Supreme Court ruled 5 to 4 that there would be no additional counting of presidential votes in Florida. 35 days after Election Day, the decision effectively handed the presidency to George W. Bush."
## [369] "Hours after casting his final vote as a member of Congress, Representative John D. Dingell was hospitalized after taking a spill earlier in the week."
## [370] "Jungle Lady is the first single off the NewYork-based act Lion Babes self-titled debut EP."
## [371] "The Securities and Exchange Commission said it would not be able to proceed with the lawsuit because the two main witnesses it intended to call to testify are in Poland and had no plans to return to the USA."
## [372] "The return of Joel D. Lafargue, after a long break."
## [373] "Nominees that didnt make the final cut this year include Nine Inch Nails, Sting, N.W.A, Chic and Kraftwerk."
## [374] "The disturbing statistics on military rape and the fleeting attention of the public led Mary F. Calvert to document the stories of those who survived assaults."
## [375] "James E. Staley, a managing partner at the NewYork hedge fund BlueMountain Capital Management, is expected to be considered for the board during UBSs annual meeting in May, and to join the banks risk committee."
## [376] "Wednesday: The M.T.A. honors its workers, oddly warm weather, and a vigil for Pakistan."
## [377] "President Obama is scheduled to speak about relations with Cuba at 12 p.m. ET."
## [378] "Highlights and video of the remarks by President Raul Castro of Cuba on a diplomatic breakthrough with the USA."
## [379] "Speaker John A. Boehner on Wednesday called President Obamas new policy on Cuba the latest in a long line of mindless concessions to a dictatorship that brutalizes its people and schemed with our enemies."
## [380] "Alan P. Gross prepared his remarks at his attorneys office in Washington on Wednesday."
## [381] "Most Americans support normalizing trade and diplomatic relations with Cuba and consider what happens in the communist nation to be important to the interests of the USA."
## [382] "Photos from Pakistan, Australia, France and the USA."
## [383] "President Obama took the occasion of Alan P. Grosss release not only to acknowledge the interest the Jewish community had taken in Mr. Grosss case, but also to connect his release to the holiday and to Jewish teaching."
## [384] "Curtis L. Buser, who has been interim finance chief for the last seven months, is taking over the job permanently."
## [385] "Speaker John A. Boehner posted a holiday greeting on Thursday, a video poem title Happy Christmas to All, although the Speakers wishes may not extend all the way to the White House."
## [386] "Recommended books for learning more about Cubas history and its relationship to the USA."
## [387] "In a speech on Thursday, Benjamin M. Lawsky, NewYork States top financial regulator, provided the most detail yet on revisions to his proposed Bitcoin rules."
## [388] "The firm promoted John E. Waldron, one of its star deal makers, to become a co-head of its investment bank, succeeding a scion of one of Goldmans most famous families."
## [389] "The disposal is the latest move for Xerox, which has been transformed under the leadership of Ursula M. Burns into a provider of various business services."
## [390] "Speaker John A. Boehner has invited President Obama to address the country on Jan. 20 and give a State of the Union speech before a joint session of the new Republican-controlled Congress."
## [391] "Mark A. Flaherty and Mark O. Winkelman will serve on a board that now has 14 members."
## [392] "Goldman names John E. Waldron co-head of its investment bank. | Regulators deem MetLife too big to fail. | Another big whistle-blower reward in Bank of America case. | London tenants win battle over USA equity firm."
## [393] "After an investigation into conflicts of interest, William C. Erbey has agreed to step down from his position as chairman of Ocwen and four other related companies."
## [394] "Its about non-digital IT."
## [395] "Fred R. Conrad may be best known for his exquisite portraits, but an assignment in Kosovo taught him the value of watching and waiting for the story to come to him."
## [396] "An overhaul of the Freedom of Information Act did not survive the legislative deal-making that produced the spending plan. William D. Cohan laments that loss in the Street Scene column."
## [397] "Its hard to be FDR."
## [398] "Ocwen should look at Clayton Homes, whose credo puts customers first, writes Lawrence A. Cunningham in Another View."
## [399] "Photos from Ukraine, India, Nepal and the USA."
## [400] "NewYorkTimes reporter David W. Dunlap explains how coverage of the Bay of Pigs disaster unfolded."
## [401] "After decades of oppression of people who identify as L.G.B.T., the Cuban government now pays for gender reassignment surgery."
## [402] "Look for the hidden message from Peter A. Collins."
## [403] "The new year will bring cases from the subprime mortgage crisis, a Justice Department decision on an appeal of an insider trading ruling and, perhaps, new accounting frauds, Peter J. Henning writes in the White Collar Watch column."
## [404] "Under the F.D.A.s new policy, any man who has had sex with another man in the last year may not donate."
## [405] "Representative Michael G. Grimm of NewYork insisted last week that he would not resign from his seat. On Monday night, he changed his mind."
## [406] "Speaker John A. Boehner gave his seal of approval to Representative Michael Grimms resignation."
## [407] "David E. Sanger, chief Washington correspondent for NewYorkTimes, recalled a visit to the country 25 years ago."
## [408] "Former President George H. W. Bush was released from a Houston hospital where he has been staying since last week after experiencing shortness of breath."
## [409] "Speaker John A. Boehner expressed his support for Representative Steve Scalise, who spoke at a conference for white supremacists in 2002."
## [1] "Remaining #\\b(Fort|Ft\\.|Hong|Las|Los|New|Puerto|Saint|San|St\\.)( |-)(\\w)+# terms in Headline: "
## pattern .n
## 1 New Album 6
## 2 New Capital 4
## 3 New Director 4
## 4 New App 3
## 5 New Chief 3
## 6 New Creative 3
## 7 New Gig 3
## 8 New Musical 3
## 9 New President 3
## 10 New British 2
## 11 New Building 2
## 12 New Businesses 2
## 13 New Course 2
## 14 New Dance 2
## 15 New Edition 2
## 16 New Financing 2
## 17 New Firm 2
## 18 New Food 2
## 19 New GOP 2
## 20 New Head 2
## 21 New Hotel 2
## 22 New Inversion 2
## 23 New Life 2
## 24 New Light 2
## 25 New Listing 2
## 26 New Look 2
## 27 New One 2
## 28 New Plan 2
## 29 New Restaurant 2
## 30 New Rules 2
## 31 New Season 2
## 32 New Study 2
## 33 New Thing 2
## 34 New Tool 2
## 35 New Video 2
## 36 New Website 2
## 40 New Ad 1
## 41 New Adele 1
## 42 New Afghan 1
## 43 New Again 1
## 44 New Airport 1
## 45 New Albert 1
## 46 New Albums 1
## 47 New Answers 1
## 48 New Anti 1
## 49 New Apple 1
## 50 New Approach 1
## 51 New Arrival 1
## 52 New Arrivals 1
## 53 New Audience 1
## 54 New Autobiographies 1
## 55 New Autocorrect 1
## 56 New Badges 1
## 57 New Bag 1
## 58 New Band 1
## 59 New Banknotes 1
## 60 New Bears 1
## 61 New Big 1
## 62 New Bike 1
## 63 New Bill 1
## 64 New Biotech 1
## 65 New Birth 1
## 66 New Black 1
## 67 New Bond 1
## 68 New Book 1
## 69 New Broadway 1
## 70 New Broker 1
## 71 New Brooklyn 1
## 73 New Campaign 1
## 74 New Candidate 1
## 75 New Carpetbagger 1
## 76 New Cash 1
## 77 New Cause 1
## 78 New Chairman 1
## 79 New Charity 1
## 80 New Chart 1
## 81 New Chinese 1
## 82 New Classical 1
## 83 New Climate 1
## 84 New Club 1
## 85 New Co 1
## 72 New CO2 1
## 86 New Conflict 1
## 87 New Contemporary 1
## 88 New Crime 1
## 89 New Crowdsourced 1
## 90 New Crowns 1
## 91 New Culture 1
## 92 New Dealer 1
## 93 New Default 1
## 94 New Dell 1
## 95 New Design 1
## 96 New Designer 1
## 97 New Devices 1
## 98 New Dinosaur 1
## 99 New Diplomacy 1
## 100 New Downtown 1
## 101 New Duo 1
## 102 New Ensemble 1
## 103 New Entrant 1
## 104 New Event 1
## 105 New Exhibition 1
## 106 New Exhibitions 1
## 107 New FAA 1
## 108 New Face 1
## 109 New Ferguson 1
## 110 New Film 1
## 111 New Flights 1
## 112 New Floral 1
## 113 New Fordham 1
## 114 New Friends 1
## 115 New Front 1
## 116 New Fund 1
## 117 New Fundraising 1
## 118 New Galaxy 1
## 119 New Gender 1
## 120 New Guide 1
## 121 New Guidelines 1
## 122 New Halloween 1
## 123 New Hard 1
## 124 New Haunt 1
## 125 New Healing 1
## 126 New Health 1
## 127 New High 1
## 128 New Historical 1
## 129 New Hit 1
## 130 New Holiday 1
## 131 New Home 1
## 132 New Ideas 1
## 133 New Image 1
## 134 New Information 1
## 135 New Institute 1
## 136 New Japanese 1
## 137 New Jonathan 1
## 138 New Journalism 1
## 139 New Kennedy 1
## 140 New Kentucky 1
## 141 New Kids 1
## 142 New Kind 1
## 143 New LaBute 1
## 144 New Laura 1
## 145 New Leader 1
## 146 New Leaders 1
## 147 New Lease 1
## 148 New Level 1
## 149 New Loretta 1
## 150 New Lucerne 1
## 152 New Madonna 1
## 153 New Mamet 1
## 154 New Manager 1
## 155 New Marching 1
## 156 New Margaret 1
## 157 New Market 1
## 158 New Marylebone 1
## 159 New Measurements 1
## 160 New Messaging 1
## 161 New Mortgage 1
## 151 New MSG 1
## 162 New Multimedia 1
## 163 New Museum 1
## 164 New Names 1
## 165 New Nations 1
## 166 New Neutral 1
## 167 New Nick 1
## 168 New NorthAmerican 1
## 169 New Opposition 1
## 170 New Outpost 1
## 171 New Paper 1
## 172 New Partners 1
## 173 New Performance 1
## 174 New Pet 1
## 175 New Philanthropy 1
## 176 New Pizzeria 1
## 177 New Policy 1
## 178 New Poll 1
## 179 New Polls 1
## 180 New Pork 1
## 181 New Prime 1
## 182 New Privacy 1
## 183 New Product 1
## 184 New Productions 1
## 185 New Proposal 1
## 186 New Publishing 1
## 187 New Pumpkin 1
## 188 New Releases 1
## 189 New Research 1
## 190 New Resource 1
## 191 New Road 1
## 192 New Role 1
## 193 New Rule 1
## 195 New Sales 1
## 194 New SBA 1
## 196 New Scrutiny 1
## 197 New Series 1
## 198 New Shoe 1
## 199 New Short 1
## 200 New Show 1
## 201 New Site 1
## 202 New Smartphones 1
## 203 New Smog 1
## 204 New Sports 1
## 205 New Stadium 1
## 206 New Stands 1
## 207 New Starring 1
## 208 New Start 1
## 209 New Stephen 1
## 210 New Steps 1
## 211 New Studies 1
## 212 New Subscription 1
## 213 New Supporters 1
## 214 New Tablet 1
## 215 New Techniques 1
## 216 New Technologies 1
## 217 New Things 1
## 218 New Thoughts 1
## 219 New Tiananmen 1
## 220 New Tina 1
## 221 New Tome 1
## 222 New Track 1
## 224 New Unit 1
## 225 New Unofficial 1
## 223 New USA 1
## 226 New Views 1
## 227 New Wave 1
## 228 New Way 1
## 229 New Ways 1
## 230 New Wedge 1
## 231 New Whitney 1
## 232 New Work 1
## 233 New Worlds 1
## 234 New Wreaths 1
## 239 Saint-Gobain 1
## 240 St. Ann 1
## 241 St. Baldrick 1
## 242 St. Peter 1
## [1] " consider cleaning if relevant to problem domain; geography name; .n > 1"
## [1] "Remaining #\\b(Fort|Ft\\.|Hong|Las|Los|New|Puerto|Saint|San|St\\.)( |-)(\\w)+# terms in Snippet: "
## pattern .n
## 2 New Atheist 1
## 3 New Brain 1
## 4 New Centurys 1
## 5 New Cuban 1
## 6 New Deal 1
## 7 New Enterprise 1
## 8 New Era 1
## 9 New Health 1
## 10 New Rich 1
## 11 New Things 1
## 28 Saint-Gobain 1
## 29 St. Baldricks 1
## 30 St. Mary 1
## 31 St. Y 1
## [1] " consider cleaning if relevant to problem domain; geography name; .n > 1"
## [1] "Remaining #\\b(Fort|Ft\\.|Hong|Las|Los|New|Puerto|Saint|San|St\\.)( |-)(\\w)+# terms in Abstract: "
## pattern .n
## 2 New Atheist 1
## 3 New Brain 1
## 4 New Centurys 1
## 5 New Cuban 1
## 6 New Deal 1
## 7 New Enterprise 1
## 8 New Era 1
## 9 New Health 1
## 10 New Rich 1
## 11 New Things 1
## 28 Saint-Gobain 1
## 29 St. Andrew 1
## 30 St. Baldricks 1
## 31 St. Mary 1
## 32 St. Y 1
## [1] " consider cleaning if relevant to problem domain; geography name; .n > 1"
## [1] "Remaining #\\b(N|S|E|W|C)( |\\.)(\\w)+# terms in Headline: "
## pattern .n
## 1 C Card 1
## 2 E and 1
## 3 N Cheez 1
## 4 N Weaver 1
## 5 W Hotels 1
## [1] "Remaining #\\b(N|S|E|W|C)( |\\.)(\\w)+# terms in Snippet: "
## pattern .n
## 1 N.Y 5
## 2 C Concourse 1
## 3 C.F 1
## 4 C.H 1
## 5 C.L 1
## 6 C.M 1
## 7 E and 1
## 8 E is 1
## 9 E.E 1
## 10 N.C 1
## 11 N.W 1
## 12 S and 1
## 13 S.I 1
## 14 S.S 1
## 15 W Hotels 1
## 16 W Retreat 1
## [1] "Remaining #\\b(N|S|E|W|C)( |\\.)(\\w)+# terms in Abstract: "
## pattern .n
## 1 N.Y 5
## 2 C Concourse 1
## 3 C.F 1
## 4 C.H 1
## 5 C.L 1
## 6 C.M 1
## 7 E and 1
## 8 E is 1
## 9 E.E 1
## 10 N.C 1
## 11 N.W 1
## 12 S and 1
## 13 S.I 1
## 14 S.S 1
## 15 W Hotels 1
## 16 W Retreat 1
## [1] "Remaining #\\b(North|South|East|West|Central)( |\\.)(\\w)+# terms in Headline: "
## pattern .n
## 1 Central Bank 3
## 2 East Side 3
## 3 North West 2
## 4 West to 2
## 5 Central Bankers 1
## 6 Central Cast 1
## 7 Central Italy 1
## 8 Central Role 1
## 9 East Africa 1
## 10 East Berlin 1
## 11 East London 1
## 12 East Poland 1
## 13 East Rivals 1
## 14 East Spring 1
## 15 North Transfer 1
## 16 North Views 1
## 17 South Lawn 1
## 20 South of 1
## 18 South Pacific 1
## 19 South Street 1
## 21 South to 1
## 22 West 11th 1
## 23 West Africa 1
## 24 West African 1
## 25 West Coast 1
## [1] "Remaining #\\b(North|South|East|West|Central)( |\\.)(\\w)+# terms in Snippet: "
## pattern .n
## 1 East Side 12
## 2 Central Bank 8
## 3 West Africa 8
## 4 Central Intelligence 5
## 5 East Coast 5
## 6 West Bank 5
## 7 East and 4
## 8 West Coast 4
## 9 South Beach 3
## 10 West Side 3
## 11 East End 2
## 12 North Portico 2
## 13 South Africas 2
## 14 West African 2
## 16 West and 2
## 15 West Village 2
## 23 Central and 1
## 24 Central bankers 1
## 17 Central Banks 1
## 25 Central business 1
## 18 Central District 1
## 26 Central show 1
## 19 Central Television 1
## 20 Central Terminal 1
## 21 Central Time 1
## 22 Central Valley 1
## 27 East 55th 1
## 28 East Africa 1
## 29 East Asia 1
## 30 East Berliners 1
## 31 East Christians 1
## 41 East correspondent 1
## 32 East Flatbush 1
## 33 East German 1
## 34 East Germans 1
## 42 East haunts 1
## 35 East Hollywood 1
## 43 East in 1
## 36 East Jerusalem 1
## 37 East Poland 1
## 38 East River 1
## 39 East Room 1
## 40 East Timor 1
## 44 North African 1
## 45 North Americas 1
## 46 North Carolinas 1
## 47 North English 1
## 50 North gates 1
## 51 North to 1
## 48 North Vietnamese 1
## 49 North Water 1
## 58 South as 1
## 52 South Asia 1
## 53 South Asian 1
## 54 South Dakotas 1
## 55 South Florida 1
## 59 South in 1
## 60 South of 1
## 56 South Rhodesias 1
## 57 South Will 1
## 61 West 47th 1
## 62 West 50th 1
## 63 West Florrissant 1
## 67 West in 1
## 68 West is 1
## 64 West Rivers 1
## 65 West Sider 1
## 66 West Wing 1
## [1] "Remaining #\\b(North|South|East|West|Central)( |\\.)(\\w)+# terms in Abstract: "
## pattern .n
## 1 East Side 12
## 2 Central Bank 8
## 3 West Africa 8
## 4 Central Intelligence 6
## 5 East Coast 5
## 6 West Bank 5
## 7 East and 4
## 8 West Coast 4
## 9 South Beach 3
## 10 West Side 3
## 11 East End 2
## 12 North Portico 2
## 13 South Africas 2
## 14 West African 2
## 16 West and 2
## 15 West Village 2
## 23 Central and 1
## 24 Central bankers 1
## 17 Central Banks 1
## 25 Central business 1
## 18 Central District 1
## 26 Central show 1
## 19 Central Television 1
## 20 Central Terminal 1
## 21 Central Time 1
## 22 Central Valley 1
## 27 East 55th 1
## 28 East Africa 1
## 29 East Asia 1
## 30 East Berliners 1
## 31 East Christians 1
## 41 East correspondent 1
## 32 East Flatbush 1
## 33 East German 1
## 34 East Germans 1
## 42 East haunts 1
## 35 East Hollywood 1
## 43 East in 1
## 36 East Jerusalem 1
## 37 East Poland 1
## 38 East River 1
## 39 East Room 1
## 40 East Timor 1
## 44 North African 1
## 45 North Americas 1
## 46 North Carolinas 1
## 47 North English 1
## 50 North gates 1
## 51 North to 1
## 48 North Vietnamese 1
## 49 North Water 1
## 58 South as 1
## 52 South Asia 1
## 53 South Asian 1
## 54 South Dakotas 1
## 55 South Florida 1
## 59 South in 1
## 60 South of 1
## 56 South Rhodesias 1
## 57 South Will 1
## 61 West 47th 1
## 62 West 50th 1
## 63 West Florrissant 1
## 67 West in 1
## 68 West is 1
## 64 West Rivers 1
## 65 West Sider 1
## 66 West Wing 1
## [1] "Remaining compound terms in Headline: "
## [1] " Full TermMatrix:"
## <<DocumentTermMatrix (documents: 8402, terms: 12344)>>
## Non-/sparse entries: 52654/103661634
## Sparsity : 100%
## Maximal term length: 34
## Weighting : term frequency (tf)
## [1] "nrow(cmpnd_Tf_df): 167"
## Tf.full term filter
## about-face 3 about-face FALSE
## cat-kicker 2 cat-kicker FALSE
## obama-clinton 2 obama-clinton FALSE
## one-handed 2 one-handed FALSE
## one-stop 2 one-stop FALSE
## spell-check 2 spell-check FALSE
## Tf.full term filter
## buffalo-area 1 buffalo-area FALSE
## hotel-room 1 hotel-room FALSE
## left-wing 1 left-wing FALSE
## million-seller 1 million-seller FALSE
## wide-eyed 1 wide-eyed FALSE
## yield-hungry 1 yield-hungry FALSE
## Tf.full term filter
## vista-tibco 1 vista-tibco FALSE
## vocabulary-expanding 1 vocabulary-expanding FALSE
## wd-50s 1 wd-50s FALSE
## welser-moumlst 1 welser-moumlst FALSE
## wide-eyed 1 wide-eyed FALSE
## yield-hungry 1 yield-hungry FALSE
## [1] "Remaining compound terms in Snippet: "
## [1] " Full TermMatrix:"
## <<DocumentTermMatrix (documents: 8402, terms: 19432)>>
## Non-/sparse entries: 137415/163130249
## Sparsity : 100%
## Maximal term length: 27
## Weighting : term frequency (tf)
## [1] "nrow(cmpnd_Tf_df): 2"
## Tf.full term filter
## twice-daily 1 twice-daily FALSE
## two-fisted 1 two-fisted FALSE
## [1] "Remaining compound terms in Abstract: "
## [1] " Full TermMatrix:"
## <<DocumentTermMatrix (documents: 8402, terms: 19491)>>
## Non-/sparse entries: 137871/163625511
## Sparsity : 100%
## Maximal term length: 118
## Weighting : term frequency (tf)
## [1] "nrow(cmpnd_Tf_df): 2"
## Tf.full term filter
## twice-daily 1 twice-daily FALSE
## two-fisted 1 two-fisted FALSE
## label step_major step_minor bgn end
## 3 extract.features_process.text 3 0 42.855 105.036
## 4 extract.features_build.corpus 4 0 105.036 NA
## elapsed
## 3 62.181
## 4 NA
## [1] "Building glb_corpus_lst..."
## label step_major step_minor bgn end
## 4 extract.features_build.corpus 4 0 105.036 117.142
## 5 extract.features_extract.DTM 5 0 117.142 NA
## elapsed
## 4 12.106
## 5 NA
## [1] "Extracting TfIDf terms for Headline..."
## Warning in weighting(x): empty document(s): character(0) character(0)
## character(0)
## [1] "Extracting TfIDf terms for Snippet..."
## Warning in weighting(x): empty document(s): character(0) character(0)
## [1] "Extracting TfIDf terms for Abstract..."
## Warning in weighting(x): empty document(s): character(0) character(0)
## label step_major step_minor bgn end
## 5 extract.features_extract.DTM 5 0 117.142 131.219
## 6 extract.features_report.DTM 6 0 131.220 NA
## elapsed
## 5 14.077
## 6 NA
## [1] "Reporting TfIDf terms for Headline..."
## [1] " Full TermMatrix:"
## <<DocumentTermMatrix (documents: 8402, terms: 9218)>>
## Non-/sparse entries: 44246/77405390
## Sparsity : 100%
## Maximal term length: 31
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
## [1] " Sparse TermMatrix:"
## <<DocumentTermMatrix (documents: 8402, terms: 28)>>
## Non-/sparse entries: 4546/230710
## Sparsity : 98%
## Maximal term length: 10
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
## Warning in myplot_scatter(plt_TfIdf_df, "freq.full", "TfIdf.full",
## colorcol_name = "in.sprs"): converting in.sprs to class:factor
## Warning in loop_apply(n, do.ply): Removed 6 rows containing missing values
## (geom_path).
## Warning in loop_apply(n, do.ply): Removed 6 rows containing missing values
## (geom_path).
## [1] "Reporting TfIDf terms for Snippet..."
## [1] " Full TermMatrix:"
## <<DocumentTermMatrix (documents: 8402, terms: 13694)>>
## Non-/sparse entries: 105089/114951899
## Sparsity : 100%
## Maximal term length: 25
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
## [1] " Sparse TermMatrix:"
## <<DocumentTermMatrix (documents: 8402, terms: 22)>>
## Non-/sparse entries: 8285/176559
## Sparsity : 96%
## Maximal term length: 10
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
## Warning in myplot_scatter(plt_TfIdf_df, "freq.full", "TfIdf.full",
## colorcol_name = "in.sprs"): converting in.sprs to class:factor
## Warning in loop_apply(n, do.ply): Removed 6 rows containing missing values
## (geom_path).
## Warning in loop_apply(n, do.ply): Removed 6 rows containing missing values
## (geom_path).
## [1] "Reporting TfIDf terms for Abstract..."
## [1] " Full TermMatrix:"
## <<DocumentTermMatrix (documents: 8402, terms: 13738)>>
## Non-/sparse entries: 105466/115321210
## Sparsity : 100%
## Maximal term length: 112
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
## [1] " Sparse TermMatrix:"
## <<DocumentTermMatrix (documents: 8402, terms: 22)>>
## Non-/sparse entries: 8297/176547
## Sparsity : 96%
## Maximal term length: 10
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
## Warning in myplot_scatter(plt_TfIdf_df, "freq.full", "TfIdf.full",
## colorcol_name = "in.sprs"): converting in.sprs to class:factor
## Warning in loop_apply(n, do.ply): Removed 6 rows containing missing values
## (geom_path).
## Warning in loop_apply(n, do.ply): Removed 6 rows containing missing values
## (geom_path).
## label step_major step_minor bgn end
## 6 extract.features_report.DTM 6 0 131.220 156.383
## 7 extract.features_bind.DTM 7 0 156.383 NA
## elapsed
## 6 25.163
## 7 NA
## [1] "Binding DTM for Headline..."
## [1] "Binding DTM for Snippet..."
## [1] "Binding DTM for Abstract..."
## label step_major step_minor bgn end elapsed
## 7 extract.features_bind.DTM 7 0 156.383 156.461 0.078
## 8 extract.features_bind.DXM 8 0 156.462 NA NA
## Warning in rm(log_X_df, txt_X_df): object 'log_X_df' not found
# print(sapply(names(glb_trnobs_df), function(col) sum(is.na(glb_trnobs_df[, col]))))
# print(sapply(names(glb_newobs_df), function(col) sum(is.na(glb_newobs_df[, col]))))
# print(myplot_scatter(glb_trnobs_df, "<col1_name>", "<col2_name>", smooth=TRUE))
# Release the intermediate text-processing objects.
# Only names that currently exist are handed to rm(); naming a missing object
# makes rm() warn "object '...' not found" for it.
rm(list = intersect(c("corpus_lst", "full_TfIdf_DTM", "full_TfIdf_vctr",
                      "glb_full_DTM_lst", "glb_sprs_DTM_lst", "txt_corpus",
                      "txt_vctr"),
                    ls()))
## Warning in rm(corpus_lst, full_TfIdf_DTM, full_TfIdf_vctr,
## glb_full_DTM_lst, : object 'corpus_lst' not found
# Register the terminal "extract.features_end" step as a new major step in the
# feature-extraction timing log.
extract.features_chunk_df <- myadd_chunk(
    extract.features_chunk_df,
    "extract.features_end",
    major.inc = TRUE
)
## label step_major step_minor bgn end elapsed
## 8 extract.features_bind.DXM 8 0 156.462 211.665 55.204
## 9 extract.features_end 9 0 211.668 NA NA
# Report the time spent in each feature-extraction step logged in
# extract.features_chunk_df (presumably also plotted by the helper - confirm
# against myplt_chunk's definition).
myplt_chunk(extract.features_chunk_df)
## label step_major step_minor bgn
## 3 extract.features_process.text 3 0 42.855
## 8 extract.features_bind.DXM 8 0 156.462
## 6 extract.features_report.DTM 6 0 131.220
## 5 extract.features_extract.DTM 5 0 117.142
## 4 extract.features_build.corpus 4 0 105.036
## 2 extract.features_factorize.str.vars 2 0 42.525
## 7 extract.features_bind.DTM 7 0 156.383
## 1 extract.features_bgn 1 0 42.515
## end elapsed duration
## 3 105.036 62.181 62.181
## 8 211.665 55.204 55.203
## 6 156.383 25.163 25.163
## 5 131.219 14.077 14.077
## 4 117.142 12.106 12.106
## 2 42.855 0.330 0.330
## 7 156.461 0.078 0.078
## 1 42.524 0.009 0.009
## [1] "Total Elapsed Time: 211.665 secs"
# if (glb_save_envir)
# save(glb_feats_df,
# glb_allobs_df, #glb_trnobs_df, glb_fitobs_df, glb_OOBobs_df, glb_newobs_df,
# file=paste0(glb_out_pfx, "extract_features_dsk.RData"))
# load(paste0(glb_out_pfx, "extract_features_dsk.RData"))
# Mark the training and new-data objects as available, then replay the
# analytics petri net with the accumulated list of available objects.
glb_analytics_avl_objs <- c(glb_analytics_avl_objs,
                            "data.training.all", "data.new")
replay.petrisim(
    pn = glb_analytics_pn,
    replay.trans = glb_analytics_avl_objs,
    flip_coord = TRUE
)
## time trans "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 multiple enabled transitions: data.training.all data.new model.selected firing: data.training.all
## 1.0000 1 2 1 0 0
## 1.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction firing: data.new
## 2.0000 2 1 1 1 0
# Open the "cluster.data" step as a new major step in the global timing log.
glb_chunks_df <- myadd_chunk(
    glb_chunks_df,
    "cluster.data",
    major.inc = TRUE
)
## label step_major step_minor bgn end elapsed
## 6 extract.features 3 0 42.469 213.686 171.217
## 7 cluster.data 4 0 213.686 NA NA
# 4.0: cluster data
# Within each selected category, hierarchically cluster the observations on
# their text-derived features (cosine distance, Ward linkage, dynamic tree cut)
# and store the result in glb_allobs_df$.clusterid / .clusterid.fctr.
if (glb_cluster) {
    # library() fails fast when a package is missing; require() only warns and
    # returns FALSE, deferring the failure to the first call that needs it.
    library(proxy)
    #require(hash)
    library(dynamicTreeCut)
#     glb_hash <- hash(key=unique(glb_allobs_df$myCategory),
#                      values=1:length(unique(glb_allobs_df$myCategory)))
#     glb_hash_lst <- hash(key=unique(glb_allobs_df$myCategory),
#                      values=1:length(unique(glb_allobs_df$myCategory)))

    # Smallest cluster that cutreeDynamic() is allowed to form.
    min_cluster_size <- 20
    # .clusterid = ctgry_id_mult * <index of the category in
    #              levels(myCategory.fctr)> + <cluster label within category>
    ctgry_id_mult <- 100

    print("Clustering features: ")
    #print(cluster_vars <- grep("[HSA]\\.[PT]\\.", names(glb_allobs_df), value=TRUE))
    # Anchored so that only columns carrying the H./S./A. prefix are selected,
    # not columns that merely contain "H.", "S." or "A." somewhere in the name.
    print(cluster_vars <- grep("^[HSA]\\.", names(glb_allobs_df), value = TRUE))
    # Columns shown when a pair of observations is printed for inspection.
    dsp_cols <- c("UniqueID", "Popular", "myCategory", "Headline", cluster_vars)

    glb_allobs_df$.clusterid <- 1
    # Largest category size relative to the minimum cluster size; the id
    # multiplier has to exceed it so that ids of different categories cannot
    # collide.
    print(max_ctgry_clusters <-
              max(table(glb_allobs_df$myCategory.fctr) / min_cluster_size))
    if (ctgry_id_mult <= max_ctgry_clusters)
        stop(sprintf("ctgry_id_mult (%d) has to be > %0.2f",
                     ctgry_id_mult, max_ctgry_clusters), call. = FALSE)

    for (myCategory in c("##", "Business#Business Day#Dealbook", "OpEd#Opinion#",
                         "Styles#U.S.#", "Business#Technology#", "Science#Health#",
                         "Culture#Arts#")) {
        # which() drops NA comparisons, so observations with a missing category
        # never show up as all-NA rows in the subset.
        ctgry_rows <- which(glb_allobs_df$myCategory == myCategory)
        ctgry_lvl_ix <- match(myCategory, levels(glb_allobs_df$myCategory.fctr))
        # hclust() needs at least two observations and the id encoding needs
        # the factor level; skip (loudly) instead of failing obscurely.
        if (is.na(ctgry_lvl_ix) || (length(ctgry_rows) < 2)) {
            warning(sprintf(
                "skipping category '%s': not a level of myCategory.fctr or fewer than 2 observations",
                myCategory), call. = FALSE)
            next
        }

        ctgry_allobs_df <- glb_allobs_df[ctgry_rows, ]

        # proxy::dist is named explicitly: stats::dist has no "cosine" method.
        dstns_dist <- proxy::dist(ctgry_allobs_df[, cluster_vars], method = "cosine")
        dstns_mtrx <- as.matrix(dstns_dist)

        # Most distant pair; indices are sorted so the pair prints in row order.
        print(sprintf("max distance(%0.4f) pair:", max(dstns_mtrx)))
        max_pair_ix <- sort(as.vector(arrayInd(which.max(dstns_mtrx),
                                               dim(dstns_mtrx))))
        print(ctgry_allobs_df[max_pair_ix, dsp_cols])

        # Closest pair; the diagonal (distance of an observation to itself) is
        # set to Inf so that a self-pair can never be reported as the minimum.
        min_dstns_mtrx <- dstns_mtrx
        diag(min_dstns_mtrx) <- Inf
        print(sprintf("min distance(%0.4f) pair:", min(min_dstns_mtrx)))
        min_pair_ix <- sort(as.vector(arrayInd(which.min(min_dstns_mtrx),
                                               dim(min_dstns_mtrx))))
        print(ctgry_allobs_df[min_pair_ix, dsp_cols])

        clusters <- hclust(dstns_dist, method = "ward.D2")
        #plot(clusters, labels=NULL, hang=-1)
        myplclust(clusters, lab.col = unclass(ctgry_allobs_df[, glb_rsp_var]))

        #clusterGroups = cutree(clusters, k=7)
        clusterGroups <- cutreeDynamic(clusters, minClusterSize = min_cluster_size,
                                       method = "tree", deepSplit = 0)
        # Unassigned groups are labeled 0; the largest group has label 1
        # print() is required: values are not auto-printed inside a for loop.
        print(table(clusterGroups, ctgry_allobs_df[, glb_rsp_var], useNA = "ifany"))
        #print(ctgry_allobs_df[which(clusterGroups == 1), c("UniqueID", "Popular", "Headline")])
        #print(ctgry_allobs_df[(clusterGroups == 1) & !is.na(ctgry_allobs_df$Popular) & (ctgry_allobs_df$Popular==1), c("UniqueID", "Popular", "Headline")])

        # Fold the unassigned observations into the largest cluster.
        clusterGroups[clusterGroups == 0] <- 1
        print(table(clusterGroups, ctgry_allobs_df[, glb_rsp_var], useNA = "ifany"))
        #summary(factor(clusterGroups))

        # Make the ids unique across categories.
        clusterGroups <- clusterGroups + ctgry_id_mult * ctgry_lvl_ix
        print(table(clusterGroups, ctgry_allobs_df[, glb_rsp_var], useNA = "ifany"))

        # add to glb_allobs_df - then split the data again
        glb_allobs_df[ctgry_rows, ".clusterid"] <- clusterGroups
        #print(unique(glb_allobs_df$.clusterid))
        #print(glb_feats_df[glb_feats_df$id == ".clusterid.fctr", ])
    }

    # Observation counts per category / cluster / response value, sorted by
    # descending count.
    ctgry_xtab_df <- orderBy(reformulate(c("-", ".n")),
                             mycreate_sqlxtab_df(glb_allobs_df,
                                 c("myCategory", ".clusterid", glb_rsp_var)))
    # One row per category / cluster with one count column per response value.
    ctgry_cast_df <- orderBy(~ -Y -NA, dcast(ctgry_xtab_df,
                                             myCategory + .clusterid ~
                                                 Popular.fctr, sum, value.var = ".n"))
    print(ctgry_cast_df)
    #print(orderBy(~ myCategory -Y -NA, ctgry_cast_df))
    # write.table(ctgry_cast_df, paste0(glb_out_pfx, "ctgry_clst.csv"),
    #             row.names=FALSE)

    print(ctgry_sum_tbl <- table(glb_allobs_df$myCategory, glb_allobs_df$.clusterid,
                                 glb_allobs_df[, glb_rsp_var],
                                 useNA = "ifany"))

    # NOTE(review): the loop above re-assigns every clustered "OpEd#Opinion#"
    # observation an id above ctgry_id_mult, so a filter on .clusterid == 1
    # presumably selects nothing for this category - confirm the intended id.
    dsp_obs(.clusterid = 1, myCategory = "OpEd#Opinion#",
            cols = c("UniqueID", "Popular", "myCategory", ".clusterid", "Headline"),
            all = TRUE)

    # Expose the cluster id as a factor and keep the raw numeric id out of the
    # feature set.
    glb_allobs_df$.clusterid.fctr <- as.factor(glb_allobs_df$.clusterid)
    glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features, ".clusterid")
}
## Loading required package: proxy
##
## Attaching package: 'proxy'
##
## The following objects are masked from 'package:stats':
##
## as.dist, dist
##
## The following object is masked from 'package:base':
##
## as.matrix
##
## Loading required package: dynamicTreeCut
## [1] "Clustering features: "
## [1] "H.T.X2014" "H.T.X2015"
## [3] "H.T.art" "H.T.bank"
## [5] "H.T.big" "H.T.billion"
## [7] "H.T.busi" "H.T.china"
## [9] "H.T.daili" "H.T.day"
## [11] "H.T.deal" "H.T.fashion"
## [13] "H.T.first" "H.T.make"
## [15] "H.T.morn" "H.T.new"
## [17] "H.T.news" "H.T.newyork"
## [19] "H.T.obama" "H.T.pictur"
## [21] "H.T.polit" "H.T.report"
## [23] "H.T.say" "H.T.springsumm"
## [25] "H.T.take" "H.T.test"
## [27] "H.T.today" "H.T.week"
## [29] "S.T.articl" "S.T.can"
## [31] "S.T.compani" "S.T.day"
## [33] "S.T.fashion" "S.T.first"
## [35] "S.T.intern" "S.T.make"
## [37] "S.T.new" "S.T.newyork"
## [39] "S.T.newyorktim" "S.T.one"
## [41] "S.T.presid" "S.T.report"
## [43] "S.T.said" "S.T.share"
## [45] "S.T.show" "S.T.take"
## [47] "S.T.time" "S.T.week"
## [49] "S.T.will" "S.T.year"
## [51] "A.T.articl" "A.T.can"
## [53] "A.T.compani" "A.T.day"
## [55] "A.T.fashion" "A.T.first"
## [57] "A.T.intern" "A.T.make"
## [59] "A.T.new" "A.T.newyork"
## [61] "A.T.newyorktim" "A.T.one"
## [63] "A.T.presid" "A.T.report"
## [65] "A.T.said" "A.T.share"
## [67] "A.T.show" "A.T.take"
## [69] "A.T.time" "A.T.week"
## [71] "A.T.will" "A.T.year"
## [73] "H.T.clip" "H.T.ebola"
## [75] "H.T.get" "H.T.newyorktim"
## [77] "H.T.read" "H.T.word"
## [79] "H.nwrds.log" "H.nwrds.unq.log"
## [81] "H.sum.TfIdf" "H.ratio.sum.TfIdf.nwrds"
## [83] "H.nchrs.log" "H.nuppr.log"
## [85] "H.ndgts.log" "H.npnct01.log"
## [87] "H.npnct02.log" "H.npnct03.log"
## [89] "H.npnct04.log" "H.npnct05.log"
## [91] "H.npnct06.log" "H.npnct07.log"
## [93] "H.npnct08.log" "H.npnct09.log"
## [95] "H.npnct10.log" "H.npnct11.log"
## [97] "H.npnct12.log" "H.npnct13.log"
## [99] "H.npnct14.log" "H.npnct15.log"
## [101] "H.npnct16.log" "H.npnct17.log"
## [103] "H.npnct18.log" "H.npnct19.log"
## [105] "H.npnct20.log" "H.npnct21.log"
## [107] "H.npnct22.log" "H.npnct23.log"
## [109] "H.npnct24.log" "H.npnct25.log"
## [111] "H.npnct26.log" "H.npnct27.log"
## [113] "H.npnct28.log" "H.npnct29.log"
## [115] "H.npnct30.log" "H.nstopwrds.log"
## [117] "H.ratio.nstopwrds.nwrds" "H.P.http"
## [119] "H.P.year.colon" "H.P.daily.clip.report"
## [121] "H.P.fashion.week" "H.P.first.draft"
## [123] "H.P.facts.figures" "H.P.friday.night.music"
## [125] "H.P.no.comment.colon" "H.P.on.this.day"
## [127] "H.P.quandary" "H.P.readers.respond"
## [129] "H.P.recap.colon" "H.P.s.notebook"
## [131] "H.P.today.in.politic" "H.P.today.in.smallbusiness"
## [133] "H.P.verbatim.colon" "H.P.what.we.are"
## [135] "S.T.appear" "S.T.archiv"
## [137] "S.T.diari" "S.T.herald"
## [139] "S.T.obama" "S.T.photo"
## [141] "S.T.senat" "S.T.tribun"
## [143] "S.T.word" "S.nwrds.log"
## [145] "S.nwrds.unq.log" "S.sum.TfIdf"
## [147] "S.ratio.sum.TfIdf.nwrds" "S.nchrs.log"
## [149] "S.nuppr.log" "S.ndgts.log"
## [151] "S.npnct01.log" "S.npnct02.log"
## [153] "S.npnct03.log" "S.npnct04.log"
## [155] "S.npnct05.log" "S.npnct06.log"
## [157] "S.npnct07.log" "S.npnct08.log"
## [159] "S.npnct09.log" "S.npnct10.log"
## [161] "S.npnct11.log" "S.npnct12.log"
## [163] "S.npnct13.log" "S.npnct14.log"
## [165] "S.npnct15.log" "S.npnct16.log"
## [167] "S.npnct17.log" "S.npnct18.log"
## [169] "S.npnct19.log" "S.npnct20.log"
## [171] "S.npnct21.log" "S.npnct22.log"
## [173] "S.npnct23.log" "S.npnct24.log"
## [175] "S.npnct25.log" "S.npnct26.log"
## [177] "S.npnct27.log" "S.npnct28.log"
## [179] "S.npnct29.log" "S.npnct30.log"
## [181] "S.nstopwrds.log" "S.ratio.nstopwrds.nwrds"
## [183] "S.P.http" "S.P.year.colon"
## [185] "S.P.daily.clip.report" "S.P.fashion.week"
## [187] "S.P.first.draft" "S.P.metropolitan.diary.colon"
## [189] "A.T.appear" "A.T.archiv"
## [191] "A.T.diari" "A.T.herald"
## [193] "A.T.obama" "A.T.photo"
## [195] "A.T.senat" "A.T.tribun"
## [197] "A.T.word" "A.nwrds.log"
## [199] "A.nwrds.unq.log" "A.sum.TfIdf"
## [201] "A.ratio.sum.TfIdf.nwrds" "A.nchrs.log"
## [203] "A.nuppr.log" "A.ndgts.log"
## [205] "A.npnct01.log" "A.npnct02.log"
## [207] "A.npnct03.log" "A.npnct04.log"
## [209] "A.npnct05.log" "A.npnct06.log"
## [211] "A.npnct07.log" "A.npnct08.log"
## [213] "A.npnct09.log" "A.npnct10.log"
## [215] "A.npnct11.log" "A.npnct12.log"
## [217] "A.npnct13.log" "A.npnct14.log"
## [219] "A.npnct15.log" "A.npnct16.log"
## [221] "A.npnct17.log" "A.npnct18.log"
## [223] "A.npnct19.log" "A.npnct20.log"
## [225] "A.npnct21.log" "A.npnct22.log"
## [227] "A.npnct23.log" "A.npnct24.log"
## [229] "A.npnct25.log" "A.npnct26.log"
## [231] "A.npnct27.log" "A.npnct28.log"
## [233] "A.npnct29.log" "A.npnct30.log"
## [235] "A.nstopwrds.log" "A.ratio.nstopwrds.nwrds"
## [237] "A.P.http" "A.P.year.colon"
## [239] "A.P.daily.clip.report" "A.P.fashion.week"
## [241] "A.P.first.draft" "A.P.metropolitan.diary.colon"
## [1] 80.55
## [1] "max distance(0.3145) pair:"
## UniqueID Popular myCategory
## 2819 2819 0 ##
## 7744 7744 NA ##
## Headline
## 2819 R&B 2.0
## 7744 From the Archives: Article From 2009 About Gross's Detention in Cuba
## H.T.X2014 H.T.X2015 H.T.art H.T.bank H.T.big H.T.billion H.T.busi
## 2819 0 0 0 0 0 0 0
## 7744 0 0 0 0 0 0 0
## H.T.china H.T.daili H.T.day H.T.deal H.T.fashion H.T.first H.T.make
## 2819 0 0 0 0 0 0 0
## 7744 0 0 0 0 0 0 0
## H.T.morn H.T.new H.T.news H.T.newyork H.T.obama H.T.pictur H.T.polit
## 2819 0 0 0 0 0 0 0
## 7744 0 0 0 0 0 0 0
## H.T.report H.T.say H.T.springsumm H.T.take H.T.test H.T.today
## 2819 0 0 0 0 0 0
## 7744 0 0 0 0 0 0
## H.T.week S.T.articl S.T.can S.T.compani S.T.day S.T.fashion
## 2819 0 0.000000 0.3880949 0 0 0
## 7744 0 1.656809 0.0000000 0 0 0
## S.T.first S.T.intern S.T.make S.T.new S.T.newyork S.T.newyorktim
## 2819 0 0 0 0 0 0.000000
## 7744 0 0 0 0 0 1.527063
## S.T.one S.T.presid S.T.report S.T.said S.T.share S.T.show S.T.take
## 2819 0 0 0 0 0 0 0
## 7744 0 0 0 0 0 0 0
## S.T.time S.T.week S.T.will S.T.year A.T.articl A.T.can A.T.compani
## 2819 0 0 0 0 0.000000 0.3880949 0
## 7744 0 0 0 0 1.656809 0.0000000 0
## A.T.day A.T.fashion A.T.first A.T.intern A.T.make A.T.new A.T.newyork
## 2819 0 0 0 0 0 0 0
## 7744 0 0 0 0 0 0 0
## A.T.newyorktim A.T.one A.T.presid A.T.report A.T.said A.T.share
## 2819 0.000000 0 0 0 0 0
## 7744 1.524331 0 0 0 0 0
## A.T.show A.T.take A.T.time A.T.week A.T.will A.T.year H.T.clip
## 2819 0 0 0 0 0 0 0
## 7744 0 0 0 0 0 0 0
## H.T.ebola H.T.get H.T.newyorktim H.T.read H.T.word H.nwrds.log
## 2819 0 0 0 0 0 1.609438
## 7744 0 0 0 0 0 2.564949
## H.nwrds.unq.log H.sum.TfIdf H.ratio.sum.TfIdf.nwrds H.nchrs.log
## 2819 0.00000 0.00000 0.0000000 2.079442
## 7744 1.94591 10.53923 0.8782694 4.234107
## H.nuppr.log H.ndgts.log H.npnct01.log H.npnct02.log H.npnct03.log
## 2819 1.098612 1.098612 0 0 0
## 7744 2.197225 1.609438 0 0 0
## H.npnct04.log H.npnct05.log H.npnct06.log H.npnct07.log H.npnct08.log
## 2819 0 0 0.6931472 0.0000000 0
## 7744 0 0 0.0000000 0.6931472 0
## H.npnct09.log H.npnct10.log H.npnct11.log H.npnct12.log H.npnct13.log
## 2819 0 0 0 0 0.6931472
## 7744 0 0 0 0 0.0000000
## H.npnct14.log H.npnct15.log H.npnct16.log H.npnct17.log H.npnct18.log
## 2819 0 0.0000000 0 0 0
## 7744 0 0.6931472 0 0 0
## H.npnct19.log H.npnct20.log H.npnct21.log H.npnct22.log H.npnct23.log
## 2819 0 0 0 0 0
## 7744 0 0 0 0 0
## H.npnct24.log H.npnct25.log H.npnct26.log H.npnct27.log H.npnct28.log
## 2819 0.6931472 0 0 0 0
## 7744 0.6931472 0 0 0 0
## H.npnct29.log H.npnct30.log H.nstopwrds.log H.ratio.nstopwrds.nwrds
## 2819 0 0 0.000000 0.2000000
## 7744 0 0 1.098612 0.2307692
## H.P.http H.P.year.colon H.P.daily.clip.report H.P.fashion.week
## 2819 0 0 0 0
## 7744 0 0 0 0
## H.P.first.draft H.P.facts.figures H.P.friday.night.music
## 2819 0 0 0
## 7744 0 0 0
## H.P.no.comment.colon H.P.on.this.day H.P.quandary H.P.readers.respond
## 2819 0 0 0 0
## 7744 0 0 0 0
## H.P.recap.colon H.P.s.notebook H.P.today.in.politic
## 2819 0 0 0
## 7744 0 0 0
## H.P.today.in.smallbusiness H.P.verbatim.colon H.P.what.we.are
## 2819 0 0 0
## 7744 0 0 0
## S.T.appear S.T.archiv S.T.diari S.T.herald S.T.obama S.T.photo
## 2819 0 0 0 0 0 0
## 7744 0 0 0 0 0 0
## S.T.senat S.T.tribun S.T.word S.nwrds.log S.nwrds.unq.log S.sum.TfIdf
## 2819 0 0 0 3.218876 2.564949 7.895172
## 7744 0 0 0 1.791759 1.386294 6.001057
## S.ratio.sum.TfIdf.nwrds S.nchrs.log S.nuppr.log S.ndgts.log
## 2819 0.3289655 4.934474 1.609438 0
## 7744 1.2002115 3.496508 1.386294 0
## S.npnct01.log S.npnct02.log S.npnct03.log S.npnct04.log S.npnct05.log
## 2819 0 0 0 0 0
## 7744 0 0 0 0 0
## S.npnct06.log S.npnct07.log S.npnct08.log S.npnct09.log S.npnct10.log
## 2819 0.6931472 0 0 0 0
## 7744 0.0000000 0 0 0 0
## S.npnct11.log S.npnct12.log S.npnct13.log S.npnct14.log S.npnct15.log
## 2819 0.6931472 0 1.0986123 0 0
## 7744 0.0000000 0 0.6931472 0 0
## S.npnct16.log S.npnct17.log S.npnct18.log S.npnct19.log S.npnct20.log
## 2819 0 0 0 0 0
## 7744 0 0 0 0 0
## S.npnct21.log S.npnct22.log S.npnct23.log S.npnct24.log S.npnct25.log
## 2819 0 0 0 0.6931472 0
## 7744 0 0 0 0.6931472 0
## S.npnct26.log S.npnct27.log S.npnct28.log S.npnct29.log S.npnct30.log
## 2819 0 0 0 0 0
## 7744 0 0 0 0 0
## S.nstopwrds.log S.ratio.nstopwrds.nwrds S.P.http S.P.year.colon
## 2819 2.3025851 0.4000000 0 0
## 7744 0.6931472 0.3333333 0 0
## S.P.daily.clip.report S.P.fashion.week S.P.first.draft
## 2819 0 0 0
## 7744 0 0 0
## S.P.metropolitan.diary.colon A.T.appear A.T.archiv A.T.diari
## 2819 0 0 0 0
## 7744 0 0 0 0
## A.T.herald A.T.obama A.T.photo A.T.senat A.T.tribun A.T.word
## 2819 0 0 0 0 0 0
## 7744 0 0 0 0 0 0
## A.nwrds.log A.nwrds.unq.log A.sum.TfIdf A.ratio.sum.TfIdf.nwrds
## 2819 3.218876 2.564949 7.894199 0.3289249
## 7744 1.791759 1.386294 5.998325 1.1996650
## A.nchrs.log A.nuppr.log A.ndgts.log A.npnct01.log A.npnct02.log
## 2819 4.934474 1.609438 0 0 0
## 7744 3.496508 1.386294 0 0 0
## A.npnct03.log A.npnct04.log A.npnct05.log A.npnct06.log A.npnct07.log
## 2819 0 0 0 0.6931472 0
## 7744 0 0 0 0.0000000 0
## A.npnct08.log A.npnct09.log A.npnct10.log A.npnct11.log A.npnct12.log
## 2819 0 0 0 0.6931472 0
## 7744 0 0 0 0.0000000 0
## A.npnct13.log A.npnct14.log A.npnct15.log A.npnct16.log A.npnct17.log
## 2819 1.0986123 0 0 0 0
## 7744 0.6931472 0 0 0 0
## A.npnct18.log A.npnct19.log A.npnct20.log A.npnct21.log A.npnct22.log
## 2819 0 0 0 0 0
## 7744 0 0 0 0 0
## A.npnct23.log A.npnct24.log A.npnct25.log A.npnct26.log A.npnct27.log
## 2819 0 0.6931472 0 0 0
## 7744 0 0.6931472 0 0 0
## A.npnct28.log A.npnct29.log A.npnct30.log A.nstopwrds.log
## 2819 0 0 0 2.3025851
## 7744 0 0 0 0.6931472
## A.ratio.nstopwrds.nwrds A.P.http A.P.year.colon A.P.daily.clip.report
## 2819 0.4000000 0 0 0
## 7744 0.3333333 0 0 0
## A.P.fashion.week A.P.first.draft A.P.metropolitan.diary.colon
## 2819 0 0 0
## 7744 0 0 0
## [1] "min distance(-0.0000) pair:"
## UniqueID Popular myCategory Headline H.T.X2014 H.T.X2015
## 849 849 0 ## The Weekly Wrap 0 0
## 1984 1984 0 ## The Weekly Wrap 0 0
## H.T.art H.T.bank H.T.big H.T.billion H.T.busi H.T.china H.T.daili
## 849 0 0 0 0 0 0 0
## 1984 0 0 0 0 0 0 0
## H.T.day H.T.deal H.T.fashion H.T.first H.T.make H.T.morn H.T.new
## 849 0 0 0 0 0 0 0
## 1984 0 0 0 0 0 0 0
## H.T.news H.T.newyork H.T.obama H.T.pictur H.T.polit H.T.report
## 849 0 0 0 0 0 0
## 1984 0 0 0 0 0 0
## H.T.say H.T.springsumm H.T.take H.T.test H.T.today H.T.week
## 849 0 0 0 0 0 2.479851
## 1984 0 0 0 0 0 2.479851
## S.T.articl S.T.can S.T.compani S.T.day S.T.fashion S.T.first
## 849 0 0 0 0 0 0
## 1984 0 0 0 0 0 0
## S.T.intern S.T.make S.T.new S.T.newyork S.T.newyorktim S.T.one
## 849 0 0 0 0 0 0
## 1984 0 0 0 0 0 0
## S.T.presid S.T.report S.T.said S.T.share S.T.show S.T.take S.T.time
## 849 0 0 0 0 0 0 0.9733184
## 1984 0 0 0 0 0 0 0.9733184
## S.T.week S.T.will S.T.year A.T.articl A.T.can A.T.compani A.T.day
## 849 0.787697 0 0 0 0 0 0
## 1984 0.787697 0 0 0 0 0 0
## A.T.fashion A.T.first A.T.intern A.T.make A.T.new A.T.newyork
## 849 0 0 0 0 0 0
## 1984 0 0 0 0 0 0
## A.T.newyorktim A.T.one A.T.presid A.T.report A.T.said A.T.share
## 849 0 0 0 0 0 0
## 1984 0 0 0 0 0 0
## A.T.show A.T.take A.T.time A.T.week A.T.will A.T.year H.T.clip
## 849 0 0 0.9723183 0.787697 0 0 0
## 1984 0 0 0.9723183 0.787697 0 0 0
## H.T.ebola H.T.get H.T.newyorktim H.T.read H.T.word H.nwrds.log
## 849 0 0 0 0 0 1.386294
## 1984 0 0 0 0 0 1.386294
## H.nwrds.unq.log H.sum.TfIdf H.ratio.sum.TfIdf.nwrds H.nchrs.log
## 849 1.098612 7.337145 2.445715 2.833213
## 1984 1.098612 7.337145 2.445715 2.833213
## H.nuppr.log H.ndgts.log H.npnct01.log H.npnct02.log H.npnct03.log
## 849 1.386294 0 0 0 0
## 1984 1.386294 0 0 0 0
## H.npnct04.log H.npnct05.log H.npnct06.log H.npnct07.log H.npnct08.log
## 849 0 0 0 0 0
## 1984 0 0 0 0 0
## H.npnct09.log H.npnct10.log H.npnct11.log H.npnct12.log H.npnct13.log
## 849 0 0 0 0 0
## 1984 0 0 0 0 0
## H.npnct14.log H.npnct15.log H.npnct16.log H.npnct17.log H.npnct18.log
## 849 0 0 0 0 0
## 1984 0 0 0 0 0
## H.npnct19.log H.npnct20.log H.npnct21.log H.npnct22.log H.npnct23.log
## 849 0 0 0 0 0
## 1984 0 0 0 0 0
## H.npnct24.log H.npnct25.log H.npnct26.log H.npnct27.log H.npnct28.log
## 849 0.6931472 0 0 0 0
## 1984 0.6931472 0 0 0 0
## H.npnct29.log H.npnct30.log H.nstopwrds.log H.ratio.nstopwrds.nwrds
## 849 0 0 0 0.25
## 1984 0 0 0 0.25
## H.P.http H.P.year.colon H.P.daily.clip.report H.P.fashion.week
## 849 0 0 0 0
## 1984 0 0 0 0
## H.P.first.draft H.P.facts.figures H.P.friday.night.music
## 849 0 0 0
## 1984 0 0 0
## H.P.no.comment.colon H.P.on.this.day H.P.quandary H.P.readers.respond
## 849 0 0 0 0
## 1984 0 0 0 0
## H.P.recap.colon H.P.s.notebook H.P.today.in.politic
## 849 0 0 0
## 1984 0 0 0
## H.P.today.in.smallbusiness H.P.verbatim.colon H.P.what.we.are
## 849 0 0 0
## 1984 0 0 0
## S.T.appear S.T.archiv S.T.diari S.T.herald S.T.obama S.T.photo
## 849 0 0 0 0 0 0
## 1984 0 0 0 0 0 0
## S.T.senat S.T.tribun S.T.word S.nwrds.log S.nwrds.unq.log S.sum.TfIdf
## 849 0 0 0 2.197225 1.791759 5.433504
## 1984 0 0 0 2.197225 1.791759 5.433504
## S.ratio.sum.TfIdf.nwrds S.nchrs.log S.nuppr.log S.ndgts.log
## 849 0.679188 3.871201 1.386294 0
## 1984 0.679188 3.871201 1.386294 0
## S.npnct01.log S.npnct02.log S.npnct03.log S.npnct04.log S.npnct05.log
## 849 0 0 0 0 0
## 1984 0 0 0 0 0
## S.npnct06.log S.npnct07.log S.npnct08.log S.npnct09.log S.npnct10.log
## 849 0 0 0 0 0
## 1984 0 0 0 0 0
## S.npnct11.log S.npnct12.log S.npnct13.log S.npnct14.log S.npnct15.log
## 849 0 0 0.6931472 0 0
## 1984 0 0 0.6931472 0 0
## S.npnct16.log S.npnct17.log S.npnct18.log S.npnct19.log S.npnct20.log
## 849 0 0 0 0 0
## 1984 0 0 0 0 0
## S.npnct21.log S.npnct22.log S.npnct23.log S.npnct24.log S.npnct25.log
## 849 0 0 0 0.6931472 0
## 1984 0 0 0 0.6931472 0
## S.npnct26.log S.npnct27.log S.npnct28.log S.npnct29.log S.npnct30.log
## 849 0 0 0 0 0
## 1984 0 0 0 0 0
## S.nstopwrds.log S.ratio.nstopwrds.nwrds S.P.http S.P.year.colon
## 849 1.386294 0.4444444 0 0
## 1984 1.386294 0.4444444 0 0
## S.P.daily.clip.report S.P.fashion.week S.P.first.draft
## 849 0 0 0
## 1984 0 0 0
## S.P.metropolitan.diary.colon A.T.appear A.T.archiv A.T.diari
## 849 0 0 0 0
## 1984 0 0 0 0
## A.T.herald A.T.obama A.T.photo A.T.senat A.T.tribun A.T.word
## 849 0 0 0 0 0 0
## 1984 0 0 0 0 0 0
## A.nwrds.log A.nwrds.unq.log A.sum.TfIdf A.ratio.sum.TfIdf.nwrds
## 849 2.197225 1.791759 5.428165 0.6785206
## 1984 2.197225 1.791759 5.428165 0.6785206
## A.nchrs.log A.nuppr.log A.ndgts.log A.npnct01.log A.npnct02.log
## 849 3.871201 1.386294 0 0 0
## 1984 3.871201 1.386294 0 0 0
## A.npnct03.log A.npnct04.log A.npnct05.log A.npnct06.log A.npnct07.log
## 849 0 0 0 0 0
## 1984 0 0 0 0 0
## A.npnct08.log A.npnct09.log A.npnct10.log A.npnct11.log A.npnct12.log
## 849 0 0 0 0 0
## 1984 0 0 0 0 0
## A.npnct13.log A.npnct14.log A.npnct15.log A.npnct16.log A.npnct17.log
## 849 0.6931472 0 0 0 0
## 1984 0.6931472 0 0 0 0
## A.npnct18.log A.npnct19.log A.npnct20.log A.npnct21.log A.npnct22.log
## 849 0 0 0 0 0
## 1984 0 0 0 0 0
## A.npnct23.log A.npnct24.log A.npnct25.log A.npnct26.log A.npnct27.log
## 849 0 0.6931472 0 0 0
## 1984 0 0.6931472 0 0 0
## A.npnct28.log A.npnct29.log A.npnct30.log A.nstopwrds.log
## 849 0 0 0 1.386294
## 1984 0 0 0 1.386294
## A.ratio.nstopwrds.nwrds A.P.http A.P.year.colon A.P.daily.clip.report
## 849 0.4444444 0 0 0
## 1984 0.4444444 0 0 0
## A.P.fashion.week A.P.first.draft A.P.metropolitan.diary.colon
## 849 0 0 0
## 1984 0 0 0
## [1] "max distance(0.0829) pair:"
## UniqueID Popular myCategory
## 7220 7220 NA Business#Business Day#Dealbook
## 7941 7941 NA Business#Business Day#Dealbook
## Headline H.T.X2014
## 7220 Lending Club Set to Debut, and Industry Is Watching 0
## 7941 1. ___________ Weighs In on Goldman's Crossword Puzzle 0
## H.T.X2015 H.T.art H.T.bank H.T.big H.T.billion H.T.busi H.T.china
## 7220 0 0 0 0 0 0 0
## 7941 0 0 0 0 0 0 0
## H.T.daili H.T.day H.T.deal H.T.fashion H.T.first H.T.make H.T.morn
## 7220 0 0 0 0 0 0 0
## 7941 0 0 0 0 0 0 0
## H.T.new H.T.news H.T.newyork H.T.obama H.T.pictur H.T.polit
## 7220 0 0 0 0 0 0
## 7941 0 0 0 0 0 0
## H.T.report H.T.say H.T.springsumm H.T.take H.T.test H.T.today
## 7220 0 0 0 0 0 0
## 7941 0 0 0 0 0 0
## H.T.week S.T.articl S.T.can S.T.compani S.T.day S.T.fashion S.T.first
## 7220 0 0 0 0.5530365 0 0 0
## 7941 0 0 0 0.0000000 0 0 0
## S.T.intern S.T.make S.T.new S.T.newyork S.T.newyorktim S.T.one
## 7220 0 0 0 0 0 0
## 7941 0 0 0 0 0 0
## S.T.presid S.T.report S.T.said S.T.share S.T.show S.T.take S.T.time
## 7220 0 0 0 0.3346473 0 0 0
## 7941 0 0 0 0.0000000 0 0 0
## S.T.week S.T.will S.T.year A.T.articl A.T.can A.T.compani A.T.day
## 7220 0 0 0.2561173 0 0 0.5530365 0
## 7941 0 0 0.0000000 0 0 0.0000000 0
## A.T.fashion A.T.first A.T.intern A.T.make A.T.new A.T.newyork
## 7220 0 0 0 0 0 0
## 7941 0 0 0 0 0 0
## A.T.newyorktim A.T.one A.T.presid A.T.report A.T.said A.T.share
## 7220 0 0 0 0 0 0.3342766
## 7941 0 0 0 0 0 0.0000000
## A.T.show A.T.take A.T.time A.T.week A.T.will A.T.year H.T.clip
## 7220 0 0 0 0 0 0.2561173 0
## 7941 0 0 0 0 0 0.0000000 0
## H.T.ebola H.T.get H.T.newyorktim H.T.read H.T.word H.nwrds.log
## 7220 0 0 0 0 0 2.302585
## 7941 0 0 0 0 0 2.302585
## H.nwrds.unq.log H.sum.TfIdf H.ratio.sum.TfIdf.nwrds H.nchrs.log
## 7220 1.945910 8.255108 0.9172343 3.951244
## 7941 1.609438 8.868079 0.9853422 4.007333
## H.nuppr.log H.ndgts.log H.npnct01.log H.npnct02.log H.npnct03.log
## 7220 2.079442 0.0000000 0 0 0
## 7941 1.791759 0.6931472 0 0 0
## H.npnct04.log H.npnct05.log H.npnct06.log H.npnct07.log H.npnct08.log
## 7220 0 0 0 0.0000000 0
## 7941 0 0 0 0.6931472 0
## H.npnct09.log H.npnct10.log H.npnct11.log H.npnct12.log H.npnct13.log
## 7220 0 0 0.6931472 0 0.0000000
## 7941 0 0 0.0000000 0 0.6931472
## H.npnct14.log H.npnct15.log H.npnct16.log H.npnct17.log H.npnct18.log
## 7220 0 0 0 0 0
## 7941 0 0 0 0 0
## H.npnct19.log H.npnct20.log H.npnct21.log H.npnct22.log H.npnct23.log
## 7220 0 0 0 0 0
## 7941 0 0 0 0 0
## H.npnct24.log H.npnct25.log H.npnct26.log H.npnct27.log H.npnct28.log
## 7220 0.6931472 0.000000 0 0 0
## 7941 0.6931472 2.484907 0 0 0
## H.npnct29.log H.npnct30.log H.nstopwrds.log H.ratio.nstopwrds.nwrds
## 7220 0 0 1.0986123 0.3
## 7941 0 0 0.6931472 0.2
## H.P.http H.P.year.colon H.P.daily.clip.report H.P.fashion.week
## 7220 0 0 0 0
## 7941 0 0 0 0
## H.P.first.draft H.P.facts.figures H.P.friday.night.music
## 7220 0 0 0
## 7941 0 0 0
## H.P.no.comment.colon H.P.on.this.day H.P.quandary H.P.readers.respond
## 7220 0 0 0 0
## 7941 0 0 0 0
## H.P.recap.colon H.P.s.notebook H.P.today.in.politic
## 7220 0 0 0
## 7941 0 0 0
## H.P.today.in.smallbusiness H.P.verbatim.colon H.P.what.we.are
## 7220 0 0 0
## 7941 0 0 0
## S.T.appear S.T.archiv S.T.diari S.T.herald S.T.obama S.T.photo
## 7220 0 0 0 0 0 0
## 7941 0 0 0 0 0 0
## S.T.senat S.T.tribun S.T.word S.nwrds.log S.nwrds.unq.log S.sum.TfIdf
## 7220 0 0 0.000000 3.583519 2.708050 6.081635
## 7941 0 0 1.102591 2.397895 1.791759 8.002660
## S.ratio.sum.TfIdf.nwrds S.nchrs.log S.nuppr.log S.ndgts.log
## 7220 0.173761 5.192957 1.098612 2.197225
## 7941 0.800266 4.143135 1.791759 0.000000
## S.npnct01.log S.npnct02.log S.npnct03.log S.npnct04.log S.npnct05.log
## 7220 0 0 0 1.609438 0
## 7941 0 0 0 0.000000 0
## S.npnct06.log S.npnct07.log S.npnct08.log S.npnct09.log S.npnct10.log
## 7220 0 0 0 0 0
## 7941 0 0 0 0 0
## S.npnct11.log S.npnct12.log S.npnct13.log S.npnct14.log S.npnct15.log
## 7220 1.098612 1.098612 1.3862944 0 0.0000000
## 7941 0.000000 0.000000 0.6931472 0 0.6931472
## S.npnct16.log S.npnct17.log S.npnct18.log S.npnct19.log S.npnct20.log
## 7220 0 0 0 0 0
## 7941 0 0 0 0 0
## S.npnct21.log S.npnct22.log S.npnct23.log S.npnct24.log S.npnct25.log
## 7220 0 0 0 0.6931472 0
## 7941 0 0 0 0.6931472 0
## S.npnct26.log S.npnct27.log S.npnct28.log S.npnct29.log S.npnct30.log
## 7220 0 0 0 0 0
## 7941 0 0 0 0 0
## S.nstopwrds.log S.ratio.nstopwrds.nwrds S.P.http S.P.year.colon
## 7220 2.639057 0.3888889 0 0
## 7941 1.609438 0.4545455 0 0
## S.P.daily.clip.report S.P.fashion.week S.P.first.draft
## 7220 0 0 0
## 7941 0 0 0
## S.P.metropolitan.diary.colon A.T.appear A.T.archiv A.T.diari
## 7220 0 0 0 0
## 7941 0 0 0 0
## A.T.herald A.T.obama A.T.photo A.T.senat A.T.tribun A.T.word
## 7220 0 0 0 0 0 0.000000
## 7941 0 0 0 0 0 1.102591
## A.nwrds.log A.nwrds.unq.log A.sum.TfIdf A.ratio.sum.TfIdf.nwrds
## 7220 3.583519 2.708050 6.079762 0.1737075
## 7941 2.397895 1.791759 8.002660 0.8002660
## A.nchrs.log A.nuppr.log A.ndgts.log A.npnct01.log A.npnct02.log
## 7220 5.192957 1.098612 2.197225 0 0
## 7941 4.143135 1.791759 0.000000 0 0
## A.npnct03.log A.npnct04.log A.npnct05.log A.npnct06.log A.npnct07.log
## 7220 0 1.609438 0 0 0
## 7941 0 0.000000 0 0 0
## A.npnct08.log A.npnct09.log A.npnct10.log A.npnct11.log A.npnct12.log
## 7220 0 0 0 1.098612 1.098612
## 7941 0 0 0 0.000000 0.000000
## A.npnct13.log A.npnct14.log A.npnct15.log A.npnct16.log A.npnct17.log
## 7220 1.3862944 0 0.0000000 0 0
## 7941 0.6931472 0 0.6931472 0 0
## A.npnct18.log A.npnct19.log A.npnct20.log A.npnct21.log A.npnct22.log
## 7220 0 0 0 0 0
## 7941 0 0 0 0 0
## A.npnct23.log A.npnct24.log A.npnct25.log A.npnct26.log A.npnct27.log
## 7220 0 0.6931472 0 0 0
## 7941 0 0.6931472 0 0 0
## A.npnct28.log A.npnct29.log A.npnct30.log A.nstopwrds.log
## 7220 0 0 0 2.639057
## 7941 0 0 0 1.609438
## A.ratio.nstopwrds.nwrds A.P.http A.P.year.colon A.P.daily.clip.report
## 7220 0.3888889 0 0 0
## 7941 0.4545455 0 0 0
## A.P.fashion.week A.P.first.draft A.P.metropolitan.diary.colon
## 7220 0 0 0
## 7941 0 0 0
## [1] "min distance(0.0003) pair:"
## UniqueID Popular myCategory
## 5075 5075 1 Business#Business Day#Dealbook
## 7833 7833 NA Business#Business Day#Dealbook
## Headline
## 5075 Fannie Mae Chief Details Plan to Ease Mortgage Rules
## 7833 Ally Financial Receives Subpoena in Subprime Auto Loan Inquiry
## H.T.X2014 H.T.X2015 H.T.art H.T.bank H.T.big H.T.billion H.T.busi
## 5075 0 0 0 0 0 0 0
## 7833 0 0 0 0 0 0 0
## H.T.china H.T.daili H.T.day H.T.deal H.T.fashion H.T.first H.T.make
## 5075 0 0 0 0 0 0 0
## 7833 0 0 0 0 0 0 0
## H.T.morn H.T.new H.T.news H.T.newyork H.T.obama H.T.pictur H.T.polit
## 5075 0 0 0 0 0 0 0
## 7833 0 0 0 0 0 0 0
## H.T.report H.T.say H.T.springsumm H.T.take H.T.test H.T.today
## 5075 0 0 0 0 0 0
## 7833 0 0 0 0 0 0
## H.T.week S.T.articl S.T.can S.T.compani S.T.day S.T.fashion S.T.first
## 5075 0 0 0 0 0 0 0
## 7833 0 0 0 0 0 0 0
## S.T.intern S.T.make S.T.new S.T.newyork S.T.newyorktim S.T.one
## 5075 0 0 0 0 0 0
## 7833 0 0 0 0 0 0
## S.T.presid S.T.report S.T.said S.T.share S.T.show S.T.take S.T.time
## 5075 0 0 0.307071 0 0 0 0
## 7833 0 0 0.307071 0 0 0 0
## S.T.week S.T.will S.T.year A.T.articl A.T.can A.T.compani A.T.day
## 5075 0 0 0 0 0 0 0
## 7833 0 0 0 0 0 0 0
## A.T.fashion A.T.first A.T.intern A.T.make A.T.new A.T.newyork
## 5075 0 0 0 0 0 0
## 7833 0 0 0 0 0 0
## A.T.newyorktim A.T.one A.T.presid A.T.report A.T.said A.T.share
## 5075 0 0 0 0 0.307071 0
## 7833 0 0 0 0 0.307071 0
## A.T.show A.T.take A.T.time A.T.week A.T.will A.T.year H.T.clip
## 5075 0 0 0 0 0 0 0
## 7833 0 0 0 0 0 0 0
## H.T.ebola H.T.get H.T.newyorktim H.T.read H.T.word H.nwrds.log
## 5075 0 0 0 0 0 2.302585
## 7833 0 0 0 0 0 2.302585
## H.nwrds.unq.log H.sum.TfIdf H.ratio.sum.TfIdf.nwrds H.nchrs.log
## 5075 2.197225 9.437621 1.048625 3.970292
## 7833 2.197225 9.853723 1.094858 4.143135
## H.nuppr.log H.ndgts.log H.npnct01.log H.npnct02.log H.npnct03.log
## 5075 2.197225 0 0 0 0
## 7833 2.197225 0 0 0 0
## H.npnct04.log H.npnct05.log H.npnct06.log H.npnct07.log H.npnct08.log
## 5075 0 0 0 0 0
## 7833 0 0 0 0 0
## H.npnct09.log H.npnct10.log H.npnct11.log H.npnct12.log H.npnct13.log
## 5075 0 0 0 0 0
## 7833 0 0 0 0 0
## H.npnct14.log H.npnct15.log H.npnct16.log H.npnct17.log H.npnct18.log
## 5075 0 0 0 0 0
## 7833 0 0 0 0 0
## H.npnct19.log H.npnct20.log H.npnct21.log H.npnct22.log H.npnct23.log
## 5075 0 0 0 0 0
## 7833 0 0 0 0 0
## H.npnct24.log H.npnct25.log H.npnct26.log H.npnct27.log H.npnct28.log
## 5075 0.6931472 0 0 0 0
## 7833 0.6931472 0 0 0 0
## H.npnct29.log H.npnct30.log H.nstopwrds.log H.ratio.nstopwrds.nwrds
## 5075 0 0 0.6931472 0.2
## 7833 0 0 0.6931472 0.2
## H.P.http H.P.year.colon H.P.daily.clip.report H.P.fashion.week
## 5075 0 0 0 0
## 7833 0 0 0 0
## H.P.first.draft H.P.facts.figures H.P.friday.night.music
## 5075 0 0 0
## 7833 0 0 0
## H.P.no.comment.colon H.P.on.this.day H.P.quandary H.P.readers.respond
## 5075 0 0 0 0
## 7833 0 0 0 0
## H.P.recap.colon H.P.s.notebook H.P.today.in.politic
## 5075 0 0 0
## 7833 0 0 0
## H.P.today.in.smallbusiness H.P.verbatim.colon H.P.what.we.are
## 5075 0 0 0
## 7833 0 0 0
## S.T.appear S.T.archiv S.T.diari S.T.herald S.T.obama S.T.photo
## 5075 0 0 0 0 0 0
## 7833 0 0 0 0 0 0
## S.T.senat S.T.tribun S.T.word S.nwrds.log S.nwrds.unq.log S.sum.TfIdf
## 5075 0 0 0 3.218876 2.639057 8.084739
## 7833 0 0 0 3.218876 2.708050 8.312740
## S.ratio.sum.TfIdf.nwrds S.nchrs.log S.nuppr.log S.ndgts.log
## 5075 0.3368641 5.010635 1.609438 0
## 7833 0.3463642 5.117994 1.386294 0
## S.npnct01.log S.npnct02.log S.npnct03.log S.npnct04.log S.npnct05.log
## 5075 0 0 0 0 0
## 7833 0 0 0 0 0
## S.npnct06.log S.npnct07.log S.npnct08.log S.npnct09.log S.npnct10.log
## 5075 0 0 0 0 0
## 7833 0 0 0 0 0
## S.npnct11.log S.npnct12.log S.npnct13.log S.npnct14.log S.npnct15.log
## 5075 0 0 0.6931472 0 0
## 7833 0 0 0.6931472 0 0
## S.npnct16.log S.npnct17.log S.npnct18.log S.npnct19.log S.npnct20.log
## 5075 0 0 0 0 0
## 7833 0 0 0 0 0
## S.npnct21.log S.npnct22.log S.npnct23.log S.npnct24.log S.npnct25.log
## 5075 0 0 0 0.6931472 0
## 7833 0 0 0 0.6931472 0
## S.npnct26.log S.npnct27.log S.npnct28.log S.npnct29.log S.npnct30.log
## 5075 0 0 0 0 0
## 7833 0 0 0 0 0
## S.nstopwrds.log S.ratio.nstopwrds.nwrds S.P.http S.P.year.colon
## 5075 2.197225 0.36 0 0
## 7833 2.302585 0.40 0 0
## S.P.daily.clip.report S.P.fashion.week S.P.first.draft
## 5075 0 0 0
## 7833 0 0 0
## S.P.metropolitan.diary.colon A.T.appear A.T.archiv A.T.diari
## 5075 0 0 0 0
## 7833 0 0 0 0
## A.T.herald A.T.obama A.T.photo A.T.senat A.T.tribun A.T.word
## 5075 0 0 0 0 0 0
## 7833 0 0 0 0 0 0
## A.nwrds.log A.nwrds.unq.log A.sum.TfIdf A.ratio.sum.TfIdf.nwrds
## 5075 3.218876 2.639057 8.079100 0.3366292
## 7833 3.218876 2.708050 8.309612 0.3462338
## A.nchrs.log A.nuppr.log A.ndgts.log A.npnct01.log A.npnct02.log
## 5075 5.010635 1.609438 0 0 0
## 7833 5.117994 1.386294 0 0 0
## A.npnct03.log A.npnct04.log A.npnct05.log A.npnct06.log A.npnct07.log
## 5075 0 0 0 0 0
## 7833 0 0 0 0 0
## A.npnct08.log A.npnct09.log A.npnct10.log A.npnct11.log A.npnct12.log
## 5075 0 0 0 0 0
## 7833 0 0 0 0 0
## A.npnct13.log A.npnct14.log A.npnct15.log A.npnct16.log A.npnct17.log
## 5075 0.6931472 0 0 0 0
## 7833 0.6931472 0 0 0 0
## A.npnct18.log A.npnct19.log A.npnct20.log A.npnct21.log A.npnct22.log
## 5075 0 0 0 0 0
## 7833 0 0 0 0 0
## A.npnct23.log A.npnct24.log A.npnct25.log A.npnct26.log A.npnct27.log
## 5075 0 0.6931472 0 0 0
## 7833 0 0.6931472 0 0 0
## A.npnct28.log A.npnct29.log A.npnct30.log A.nstopwrds.log
## 5075 0 0 0 2.197225
## 7833 0 0 0 2.302585
## A.ratio.nstopwrds.nwrds A.P.http A.P.year.colon A.P.daily.clip.report
## 5075 0.36 0 0 0
## 7833 0.40 0 0 0
## A.P.fashion.week A.P.first.draft A.P.metropolitan.diary.colon
## 5075 0 0 0
## 7833 0 0 0
## [1] "max distance(0.6260) pair:"
## UniqueID Popular myCategory Headline
## 3337 3337 1 OpEd#Opinion# The State of Macro, Six Years Later
## 6680 6680 NA OpEd#Opinion# Inequality and Economic Performance
## H.T.X2014 H.T.X2015 H.T.art H.T.bank H.T.big H.T.billion H.T.busi
## 3337 0 0 0 0 0 0 0
## 6680 0 0 0 0 0 0 0
## H.T.china H.T.daili H.T.day H.T.deal H.T.fashion H.T.first H.T.make
## 3337 0 0 0 0 0 0 0
## 6680 0 0 0 0 0 0 0
## H.T.morn H.T.new H.T.news H.T.newyork H.T.obama H.T.pictur H.T.polit
## 3337 0 0 0 0 0 0 0
## 6680 0 0 0 0 0 0 0
## H.T.report H.T.say H.T.springsumm H.T.take H.T.test H.T.today
## 3337 0 0 0 0 0 0
## 6680 0 0 0 0 0 0
## H.T.week S.T.articl S.T.can S.T.compani S.T.day S.T.fashion S.T.first
## 3337 0 0 0 0 0 0 0
## 6680 0 0 0 0 0 0 0
## S.T.intern S.T.make S.T.new S.T.newyork S.T.newyorktim S.T.one
## 3337 0 0 0 0 0 0
## 6680 0 0 0 0 0 0
## S.T.presid S.T.report S.T.said S.T.share S.T.show S.T.take S.T.time
## 3337 0 0 0 0 0 0 0
## 6680 0 0 0 0 0 0 0
## S.T.week S.T.will S.T.year A.T.articl A.T.can A.T.compani A.T.day
## 3337 0 0 0 0 0 0 0
## 6680 0 0 0 0 0 0 0
## A.T.fashion A.T.first A.T.intern A.T.make A.T.new A.T.newyork
## 3337 0 0 0 0 0 0
## 6680 0 0 0 0 0 0
## A.T.newyorktim A.T.one A.T.presid A.T.report A.T.said A.T.share
## 3337 0 0 0 0 0 0
## 6680 0 0 0 0 0 0
## A.T.show A.T.take A.T.time A.T.week A.T.will A.T.year H.T.clip
## 3337 0 0 0 0 0 0 0
## 6680 0 0 0 0 0 0 0
## H.T.ebola H.T.get H.T.newyorktim H.T.read H.T.word H.nwrds.log
## 3337 0 0 0 0 0 2.079442
## 6680 0 0 0 0 0 1.609438
## H.nwrds.unq.log H.sum.TfIdf H.ratio.sum.TfIdf.nwrds H.nchrs.log
## 3337 1.791759 9.371850 1.338836 3.583519
## 6680 1.386294 8.695959 2.173990 3.583519
## H.nuppr.log H.ndgts.log H.npnct01.log H.npnct02.log H.npnct03.log
## 3337 1.945910 0 0 0 0
## 6680 1.386294 0 0 0 0
## H.npnct04.log H.npnct05.log H.npnct06.log H.npnct07.log H.npnct08.log
## 3337 0 0 0 0 0
## 6680 0 0 0 0 0
## H.npnct09.log H.npnct10.log H.npnct11.log H.npnct12.log H.npnct13.log
## 3337 0 0 0.6931472 0 0
## 6680 0 0 0.0000000 0 0
## H.npnct14.log H.npnct15.log H.npnct16.log H.npnct17.log H.npnct18.log
## 3337 0 0 0 0 0
## 6680 0 0 0 0 0
## H.npnct19.log H.npnct20.log H.npnct21.log H.npnct22.log H.npnct23.log
## 3337 0 0 0 0 0
## 6680 0 0 0 0 0
## H.npnct24.log H.npnct25.log H.npnct26.log H.npnct27.log H.npnct28.log
## 3337 0.6931472 0 0 0 0
## 6680 0.6931472 0 0 0 0
## H.npnct29.log H.npnct30.log H.nstopwrds.log H.ratio.nstopwrds.nwrds
## 3337 0 0 0.6931472 0.25
## 6680 0 0 0.6931472 0.40
## H.P.http H.P.year.colon H.P.daily.clip.report H.P.fashion.week
## 3337 0 0 0 0
## 6680 0 0 0 0
## H.P.first.draft H.P.facts.figures H.P.friday.night.music
## 3337 0 0 0
## 6680 0 0 0
## H.P.no.comment.colon H.P.on.this.day H.P.quandary H.P.readers.respond
## 3337 0 0 0 0
## 6680 0 0 0 0
## H.P.recap.colon H.P.s.notebook H.P.today.in.politic
## 3337 0 0 0
## 6680 0 0 0
## H.P.today.in.smallbusiness H.P.verbatim.colon H.P.what.we.are
## 3337 0 0 0
## 6680 0 0 0
## S.T.appear S.T.archiv S.T.diari S.T.herald S.T.obama S.T.photo
## 3337 0 0 0 0 0 0
## 6680 0 0 0 0 0 0
## S.T.senat S.T.tribun S.T.word S.nwrds.log S.nwrds.unq.log S.sum.TfIdf
## 3337 0 0 0 1.7917595 0.0000000 0.00000
## 6680 0 0 0 0.6931472 0.6931472 13.03652
## S.ratio.sum.TfIdf.nwrds S.nchrs.log S.nuppr.log S.ndgts.log
## 3337 0.00000 2.890372 1.3862944 0
## 6680 13.03652 2.639057 0.6931472 0
## S.npnct01.log S.npnct02.log S.npnct03.log S.npnct04.log S.npnct05.log
## 3337 0 0 0 0 0
## 6680 0 0 0 0 0
## S.npnct06.log S.npnct07.log S.npnct08.log S.npnct09.log S.npnct10.log
## 3337 0 0 0 0 0
## 6680 0 0 0 0 0
## S.npnct11.log S.npnct12.log S.npnct13.log S.npnct14.log S.npnct15.log
## 3337 0 0 0.6931472 0 0
## 6680 0 0 0.6931472 0 0
## S.npnct16.log S.npnct17.log S.npnct18.log S.npnct19.log S.npnct20.log
## 3337 0 0 0 0 0
## 6680 0 0 0 0 0
## S.npnct21.log S.npnct22.log S.npnct23.log S.npnct24.log S.npnct25.log
## 3337 0 0 0 0.6931472 0
## 6680 0 0 0 0.6931472 0
## S.npnct26.log S.npnct27.log S.npnct28.log S.npnct29.log S.npnct30.log
## 3337 0 0 0 0 0
## 6680 0 0 0 0 0
## S.nstopwrds.log S.ratio.nstopwrds.nwrds S.P.http S.P.year.colon
## 3337 1.386294 0.6666667 0 0
## 6680 0.000000 0.5000000 0 0
## S.P.daily.clip.report S.P.fashion.week S.P.first.draft
## 3337 0 0 0
## 6680 0 0 0
## S.P.metropolitan.diary.colon A.T.appear A.T.archiv A.T.diari
## 3337 0 0 0 0
## 6680 0 0 0 0
## A.T.herald A.T.obama A.T.photo A.T.senat A.T.tribun A.T.word
## 3337 0 0 0 0 0 0
## 6680 0 0 0 0 0 0
## A.nwrds.log A.nwrds.unq.log A.sum.TfIdf A.ratio.sum.TfIdf.nwrds
## 3337 1.7917595 0.0000000 0.00000 0.00000
## 6680 0.6931472 0.6931472 13.03652 13.03652
## A.nchrs.log A.nuppr.log A.ndgts.log A.npnct01.log A.npnct02.log
## 3337 2.890372 1.3862944 0 0 0
## 6680 2.639057 0.6931472 0 0 0
## A.npnct03.log A.npnct04.log A.npnct05.log A.npnct06.log A.npnct07.log
## 3337 0 0 0 0 0
## 6680 0 0 0 0 0
## A.npnct08.log A.npnct09.log A.npnct10.log A.npnct11.log A.npnct12.log
## 3337 0 0 0 0 0
## 6680 0 0 0 0 0
## A.npnct13.log A.npnct14.log A.npnct15.log A.npnct16.log A.npnct17.log
## 3337 0.6931472 0 0 0 0
## 6680 0.6931472 0 0 0 0
## A.npnct18.log A.npnct19.log A.npnct20.log A.npnct21.log A.npnct22.log
## 3337 0 0 0 0 0
## 6680 0 0 0 0 0
## A.npnct23.log A.npnct24.log A.npnct25.log A.npnct26.log A.npnct27.log
## 3337 0 0.6931472 0 0 0
## 6680 0 0.6931472 0 0 0
## A.npnct28.log A.npnct29.log A.npnct30.log A.nstopwrds.log
## 3337 0 0 0 1.386294
## 6680 0 0 0 0.000000
## A.ratio.nstopwrds.nwrds A.P.http A.P.year.colon A.P.daily.clip.report
## 3337 0.6666667 0 0 0
## 6680 0.5000000 0 0 0
## A.P.fashion.week A.P.first.draft A.P.metropolitan.diary.colon
## 3337 0 0 0
## 6680 0 0 0
## [1] "min distance(0.0003) pair:"
## UniqueID Popular myCategory Headline H.T.X2014
## 2569 2569 0 OpEd#Opinion# Joe on WNYC's Money Talking 0
## 7927 7927 NA OpEd#Opinion# Joe on WNYC's Money Talking 0
## H.T.X2015 H.T.art H.T.bank H.T.big H.T.billion H.T.busi H.T.china
## 2569 0 0 0 0 0 0 0
## 7927 0 0 0 0 0 0 0
## H.T.daili H.T.day H.T.deal H.T.fashion H.T.first H.T.make H.T.morn
## 2569 0 0 0 0 0 0 0
## 7927 0 0 0 0 0 0 0
## H.T.new H.T.news H.T.newyork H.T.obama H.T.pictur H.T.polit
## 2569 0 0 0 0 0 0
## 7927 0 0 0 0 0 0
## H.T.report H.T.say H.T.springsumm H.T.take H.T.test H.T.today
## 2569 0 0 0 0 0 0
## 7927 0 0 0 0 0 0
## H.T.week S.T.articl S.T.can S.T.compani S.T.day S.T.fashion S.T.first
## 2569 0 0 0 0 0 0 0
## 7927 0 0 0 0 0 0 0
## S.T.intern S.T.make S.T.new S.T.newyork S.T.newyorktim S.T.one
## 2569 0 0 0 0 0 0
## 7927 0 0 0 0 0 0
## S.T.presid S.T.report S.T.said S.T.share S.T.show S.T.take S.T.time
## 2569 0 0 0 0 0 0 0
## 7927 0 0 0 0 0 0 0
## S.T.week S.T.will S.T.year A.T.articl A.T.can A.T.compani A.T.day
## 2569 0 0 0 0 0 0 0
## 7927 0 0 0 0 0 0 0
## A.T.fashion A.T.first A.T.intern A.T.make A.T.new A.T.newyork
## 2569 0 0 0 0 0 0
## 7927 0 0 0 0 0 0
## A.T.newyorktim A.T.one A.T.presid A.T.report A.T.said A.T.share
## 2569 0 0 0 0 0 0
## 7927 0 0 0 0 0 0
## A.T.show A.T.take A.T.time A.T.week A.T.will A.T.year H.T.clip
## 2569 0 0 0 0 0 0 0
## 7927 0 0 0 0 0 0 0
## H.T.ebola H.T.get H.T.newyorktim H.T.read H.T.word H.nwrds.log
## 2569 0 0 0 0 0 1.94591
## 7927 0 0 0 0 0 1.94591
## H.nwrds.unq.log H.sum.TfIdf H.ratio.sum.TfIdf.nwrds H.nchrs.log
## 2569 1.609438 8.296223 1.382704 3.332205
## 7927 1.609438 8.296223 1.382704 3.332205
## H.nuppr.log H.ndgts.log H.npnct01.log H.npnct02.log H.npnct03.log
## 2569 2.079442 0 0 0 0
## 7927 2.079442 0 0 0 0
## H.npnct04.log H.npnct05.log H.npnct06.log H.npnct07.log H.npnct08.log
## 2569 0 0 0 0.6931472 0
## 7927 0 0 0 0.6931472 0
## H.npnct09.log H.npnct10.log H.npnct11.log H.npnct12.log H.npnct13.log
## 2569 0 0 0 0 0
## 7927 0 0 0 0 0
## H.npnct14.log H.npnct15.log H.npnct16.log H.npnct17.log H.npnct18.log
## 2569 0 0 0 0 0
## 7927 0 0 0 0 0
## H.npnct19.log H.npnct20.log H.npnct21.log H.npnct22.log H.npnct23.log
## 2569 0 0 0 0 0
## 7927 0 0 0 0 0
## H.npnct24.log H.npnct25.log H.npnct26.log H.npnct27.log H.npnct28.log
## 2569 0.6931472 0 0 0 0
## 7927 0.6931472 0 0 0 0
## H.npnct29.log H.npnct30.log H.nstopwrds.log H.ratio.nstopwrds.nwrds
## 2569 0 0 0.6931472 0.2857143
## 7927 0 0 0.6931472 0.2857143
## H.P.http H.P.year.colon H.P.daily.clip.report H.P.fashion.week
## 2569 0 0 0 0
## 7927 0 0 0 0
## H.P.first.draft H.P.facts.figures H.P.friday.night.music
## 2569 0 0 0
## 7927 0 0 0
## H.P.no.comment.colon H.P.on.this.day H.P.quandary H.P.readers.respond
## 2569 0 0 0 0
## 7927 0 0 0 0
## H.P.recap.colon H.P.s.notebook H.P.today.in.politic
## 2569 0 0 0
## 7927 0 0 0
## H.P.today.in.smallbusiness H.P.verbatim.colon H.P.what.we.are
## 2569 0 0 0
## 7927 0 0 0
## S.T.appear S.T.archiv S.T.diari S.T.herald S.T.obama S.T.photo
## 2569 0 0 0 0 0 0
## 7927 0 0 0 0 0 0
## S.T.senat S.T.tribun S.T.word S.nwrds.log S.nwrds.unq.log S.sum.TfIdf
## 2569 0 0 0 2.944439 2.484907 8.618251
## 7927 0 0 0 2.833213 2.397895 8.637717
## S.ratio.sum.TfIdf.nwrds S.nchrs.log S.nuppr.log S.ndgts.log
## 2569 0.4787917 4.672829 1.94591 0
## 7927 0.5398573 4.663439 1.94591 0
## S.npnct01.log S.npnct02.log S.npnct03.log S.npnct04.log S.npnct05.log
## 2569 0 0 0 0 0
## 7927 0 0 0 0 0
## S.npnct06.log S.npnct07.log S.npnct08.log S.npnct09.log S.npnct10.log
## 2569 0 0 0 0 0
## 7927 0 0 0 0 0
## S.npnct11.log S.npnct12.log S.npnct13.log S.npnct14.log S.npnct15.log
## 2569 0 0 0.6931472 0 0
## 7927 0 0 0.6931472 0 0
## S.npnct16.log S.npnct17.log S.npnct18.log S.npnct19.log S.npnct20.log
## 2569 0 0 0 0 0
## 7927 0 0 0 0 0
## S.npnct21.log S.npnct22.log S.npnct23.log S.npnct24.log S.npnct25.log
## 2569 0 0 0 0.6931472 0
## 7927 0 0 0 0.6931472 0
## S.npnct26.log S.npnct27.log S.npnct28.log S.npnct29.log S.npnct30.log
## 2569 0 0 0 0 0
## 7927 0 0 0 0 0
## S.nstopwrds.log S.ratio.nstopwrds.nwrds S.P.http S.P.year.colon
## 2569 2.079442 0.4210526 0 0
## 7927 1.791759 0.3529412 0 0
## S.P.daily.clip.report S.P.fashion.week S.P.first.draft
## 2569 0 0 0
## 7927 0 0 0
## S.P.metropolitan.diary.colon A.T.appear A.T.archiv A.T.diari
## 2569 0 0 0 0
## 7927 0 0 0 0
## A.T.herald A.T.obama A.T.photo A.T.senat A.T.tribun A.T.word
## 2569 0 0 0 0 0 0
## 7927 0 0 0 0 0 0
## A.nwrds.log A.nwrds.unq.log A.sum.TfIdf A.ratio.sum.TfIdf.nwrds
## 2569 2.944439 2.484907 8.615236 0.4786242
## 7927 2.833213 2.397895 8.623427 0.5389642
## A.nchrs.log A.nuppr.log A.ndgts.log A.npnct01.log A.npnct02.log
## 2569 4.672829 1.94591 0 0 0
## 7927 4.663439 1.94591 0 0 0
## A.npnct03.log A.npnct04.log A.npnct05.log A.npnct06.log A.npnct07.log
## 2569 0 0 0 0 0
## 7927 0 0 0 0 0
## A.npnct08.log A.npnct09.log A.npnct10.log A.npnct11.log A.npnct12.log
## 2569 0 0 0 0 0
## 7927 0 0 0 0 0
## A.npnct13.log A.npnct14.log A.npnct15.log A.npnct16.log A.npnct17.log
## 2569 0.6931472 0 0 0 0
## 7927 0.6931472 0 0 0 0
## A.npnct18.log A.npnct19.log A.npnct20.log A.npnct21.log A.npnct22.log
## 2569 0 0 0 0 0
## 7927 0 0 0 0 0
## A.npnct23.log A.npnct24.log A.npnct25.log A.npnct26.log A.npnct27.log
## 2569 0 0.6931472 0 0 0
## 7927 0 0.6931472 0 0 0
## A.npnct28.log A.npnct29.log A.npnct30.log A.nstopwrds.log
## 2569 0 0 0 2.079442
## 7927 0 0 0 1.791759
## A.ratio.nstopwrds.nwrds A.P.http A.P.year.colon A.P.daily.clip.report
## 2569 0.4210526 0 0 0
## 7927 0.3529412 0 0 0
## A.P.fashion.week A.P.first.draft A.P.metropolitan.diary.colon
## 2569 0 0 0
## 7927 0 0 0
## [1] "max distance(0.1175) pair:"
## UniqueID Popular myCategory Headline
## 2514 2514 0 Styles#U.S.# Fairy House for Sale. No Financing.
## 3210 3210 1 Styles#U.S.# My Toddler, Bartleby
## H.T.X2014 H.T.X2015 H.T.art H.T.bank H.T.big H.T.billion H.T.busi
## 2514 0 0 0 0 0 0 0
## 3210 0 0 0 0 0 0 0
## H.T.china H.T.daili H.T.day H.T.deal H.T.fashion H.T.first H.T.make
## 2514 0 0 0 0 0 0 0
## 3210 0 0 0 0 0 0 0
## H.T.morn H.T.new H.T.news H.T.newyork H.T.obama H.T.pictur H.T.polit
## 2514 0 0 0 0 0 0 0
## 3210 0 0 0 0 0 0 0
## H.T.report H.T.say H.T.springsumm H.T.take H.T.test H.T.today
## 2514 0 0 0 0 0 0
## 3210 0 0 0 0 0 0
## H.T.week S.T.articl S.T.can S.T.compani S.T.day S.T.fashion S.T.first
## 2514 0 0 0 0.1595298 0 0 0.1895382
## 3210 0 0 0 0.0000000 0 0 0.0000000
## S.T.intern S.T.make S.T.new S.T.newyork S.T.newyorktim S.T.one
## 2514 0 0 0 0 0 0
## 3210 0 0 0 0 0 0
## S.T.presid S.T.report S.T.said S.T.share S.T.show S.T.take S.T.time
## 2514 0 0 0 0 0 0 0
## 3210 0 0 0 0 0 0 0
## S.T.week S.T.will S.T.year A.T.articl A.T.can A.T.compani A.T.day
## 2514 0 0 0 0 0 0.112102 0
## 3210 0 0 0 0 0 0.000000 0
## A.T.fashion A.T.first A.T.intern A.T.make A.T.new A.T.newyork
## 2514 0 0.133189 0 0 0 0
## 3210 0 0.000000 0 0 0 0
## A.T.newyorktim A.T.one A.T.presid A.T.report A.T.said A.T.share
## 2514 0 0 0 0 0 0
## 3210 0 0 0 0 0 0
## A.T.show A.T.take A.T.time A.T.week A.T.will A.T.year H.T.clip
## 2514 0 0 0 0 0 0 0
## 3210 0 0 0 0 0 0 0
## H.T.ebola H.T.get H.T.newyorktim H.T.read H.T.word H.nwrds.log
## 2514 0 0 0 0 0 1.945910
## 3210 0 0 0 0 0 1.386294
## H.nwrds.unq.log H.sum.TfIdf H.ratio.sum.TfIdf.nwrds H.nchrs.log
## 2514 1.609438 8.823848 1.470641 3.583519
## 3210 1.098612 12.536517 4.178839 3.044522
## H.nuppr.log H.ndgts.log H.npnct01.log H.npnct02.log H.npnct03.log
## 2514 1.791759 0 0 0 0
## 3210 1.386294 0 0 0 0
## H.npnct04.log H.npnct05.log H.npnct06.log H.npnct07.log H.npnct08.log
## 2514 0 0 0 0 0
## 3210 0 0 0 0 0
## H.npnct09.log H.npnct10.log H.npnct11.log H.npnct12.log H.npnct13.log
## 2514 0 0 0.0000000 0 1.098612
## 3210 0 0 0.6931472 0 0.000000
## H.npnct14.log H.npnct15.log H.npnct16.log H.npnct17.log H.npnct18.log
## 2514 0 0 0 0 0
## 3210 0 0 0 0 0
## H.npnct19.log H.npnct20.log H.npnct21.log H.npnct22.log H.npnct23.log
## 2514 0 0 0 0 0
## 3210 0 0 0 0 0
## H.npnct24.log H.npnct25.log H.npnct26.log H.npnct27.log H.npnct28.log
## 2514 0.6931472 0 0 0 0
## 3210 0.6931472 0 0 0 0
## H.npnct29.log H.npnct30.log H.nstopwrds.log H.ratio.nstopwrds.nwrds
## 2514 0 0 0.6931472 0.2857143
## 3210 0 0 0.0000000 0.2500000
## H.P.http H.P.year.colon H.P.daily.clip.report H.P.fashion.week
## 2514 0 0 0 0
## 3210 0 0 0 0
## H.P.first.draft H.P.facts.figures H.P.friday.night.music
## 2514 0 0 0
## 3210 0 0 0
## H.P.no.comment.colon H.P.on.this.day H.P.quandary H.P.readers.respond
## 2514 0 0 0 0
## 3210 0 0 0 0
## H.P.recap.colon H.P.s.notebook H.P.today.in.politic
## 2514 0 0 0
## 3210 0 0 0
## H.P.today.in.smallbusiness H.P.verbatim.colon H.P.what.we.are
## 2514 0 0 0
## 3210 0 0 0
## S.T.appear S.T.archiv S.T.diari S.T.herald S.T.obama S.T.photo
## 2514 0 0 0 0 0 0
## 3210 0 0 0 0 0 0
## S.T.senat S.T.tribun S.T.word S.nwrds.log S.nwrds.unq.log S.sum.TfIdf
## 2514 0 0 0 3.850148 3.295837 7.741935
## 3210 0 0 0 2.995732 2.197225 10.730398
## S.ratio.sum.TfIdf.nwrds S.nchrs.log S.nuppr.log S.ndgts.log
## 2514 0.1683029 5.488938 2.302585 1.609438
## 3210 0.5647578 4.595120 1.098612 0.000000
## S.npnct01.log S.npnct02.log S.npnct03.log S.npnct04.log S.npnct05.log
## 2514 0 0 0 0.6931472 0
## 3210 0 0 0 0.0000000 0
## S.npnct06.log S.npnct07.log S.npnct08.log S.npnct09.log S.npnct10.log
## 2514 0 0 0 0 0
## 3210 0 0 0 0 0
## S.npnct11.log S.npnct12.log S.npnct13.log S.npnct14.log S.npnct15.log
## 2514 1.098612 0.6931472 1.9459101 0 0
## 3210 0.000000 0.0000000 0.6931472 0 0
## S.npnct16.log S.npnct17.log S.npnct18.log S.npnct19.log S.npnct20.log
## 2514 0 0 0 0 0
## 3210 0 0 0 0 0
## S.npnct21.log S.npnct22.log S.npnct23.log S.npnct24.log S.npnct25.log
## 2514 0 0 0 0.6931472 0
## 3210 0 0 0 0.6931472 0
## S.npnct26.log S.npnct27.log S.npnct28.log S.npnct29.log S.npnct30.log
## 2514 0 0 0 0 0
## 3210 0 0 0 0 0
## S.nstopwrds.log S.ratio.nstopwrds.nwrds S.P.http S.P.year.colon
## 2514 2.708050 0.3191489 0 0
## 3210 2.302585 0.5000000 0 0
## S.P.daily.clip.report S.P.fashion.week S.P.first.draft
## 2514 0 0 0
## 3210 0 0 0
## S.P.metropolitan.diary.colon A.T.appear A.T.archiv A.T.diari
## 2514 0 0 0 0
## 3210 0 0 0 0
## A.T.herald A.T.obama A.T.photo A.T.senat A.T.tribun A.T.word
## 2514 0 0 0 0 0 0
## 3210 0 0 0 0 0 0
## A.nwrds.log A.nwrds.unq.log A.sum.TfIdf A.ratio.sum.TfIdf.nwrds
## 2514 4.663439 3.637586 8.331818 0.07935065
## 3210 2.995732 2.197225 10.730398 0.56475777
## A.nchrs.log A.nuppr.log A.ndgts.log A.npnct01.log A.npnct02.log
## 2514 6.327937 2.484907 3.931826 0 1.609438
## 3210 4.595120 1.098612 0.000000 0 0.000000
## A.npnct03.log A.npnct04.log A.npnct05.log A.npnct06.log A.npnct07.log
## 2514 0.6931472 0.6931472 0 0 0
## 3210 0.0000000 0.0000000 0 0 0
## A.npnct08.log A.npnct09.log A.npnct10.log A.npnct11.log A.npnct12.log
## 2514 1.098612 0 0 1.098612 2.484907
## 3210 0.000000 0 0 0.000000 0.000000
## A.npnct13.log A.npnct14.log A.npnct15.log A.npnct16.log A.npnct17.log
## 2514 2.4849066 2.944439 1.098612 0 2.197225
## 3210 0.6931472 0.000000 0.000000 0 0.000000
## A.npnct18.log A.npnct19.log A.npnct20.log A.npnct21.log A.npnct22.log
## 2514 1.609438 1.098612 0 0 0
## 3210 0.000000 0.000000 0 0 0
## A.npnct23.log A.npnct24.log A.npnct25.log A.npnct26.log A.npnct27.log
## 2514 0 0.6931472 0 0 0
## 3210 0 0.6931472 0 0 0
## A.npnct28.log A.npnct29.log A.npnct30.log A.nstopwrds.log
## 2514 0 0 0 3.465736
## 3210 0 0 0 2.302585
## A.ratio.nstopwrds.nwrds A.P.http A.P.year.colon A.P.daily.clip.report
## 2514 0.3018868 2 0 0
## 3210 0.5000000 0 0 0
## A.P.fashion.week A.P.first.draft A.P.metropolitan.diary.colon
## 2514 0 0 0
## 3210 0 0 0
## [1] "min distance(-0.0000) pair:"
## UniqueID Popular myCategory
## 344 344 0 Styles#U.S.#
## 888 888 1 Styles#U.S.#
## Headline H.T.X2014 H.T.X2015
## 344 Your Turn: A Weekend Thread, Open for Comments 0 0
## 888 Your Turn: A Weekend Thread, Open for Comments 0 0
## H.T.art H.T.bank H.T.big H.T.billion H.T.busi H.T.china H.T.daili
## 344 0 0 0 0 0 0 0
## 888 0 0 0 0 0 0 0
## H.T.day H.T.deal H.T.fashion H.T.first H.T.make H.T.morn H.T.new
## 344 0 0 0 0 0 0 0
## 888 0 0 0 0 0 0 0
## H.T.news H.T.newyork H.T.obama H.T.pictur H.T.polit H.T.report H.T.say
## 344 0 0 0 0 0 0 0
## 888 0 0 0 0 0 0 0
## H.T.springsumm H.T.take H.T.test H.T.today H.T.week S.T.articl S.T.can
## 344 0 0 0 0 0 0 0
## 888 0 0 0 0 0 0 0
## S.T.compani S.T.day S.T.fashion S.T.first S.T.intern S.T.make S.T.new
## 344 0 0 0 0 0 0 0
## 888 0 0 0 0 0 0 0
## S.T.newyork S.T.newyorktim S.T.one S.T.presid S.T.report S.T.said
## 344 0 0 0 0 0 0
## 888 0 0 0 0 0 0
## S.T.share S.T.show S.T.take S.T.time S.T.week S.T.will S.T.year
## 344 0.3346473 0 0 0 0.5251313 0 0
## 888 0.3346473 0 0 0 0.5251313 0 0
## A.T.articl A.T.can A.T.compani A.T.day A.T.fashion A.T.first
## 344 0 0 0 0 0 0
## 888 0 0 0 0 0 0
## A.T.intern A.T.make A.T.new A.T.newyork A.T.newyorktim A.T.one
## 344 0 0 0 0 0 0
## 888 0 0 0 0 0 0
## A.T.presid A.T.report A.T.said A.T.share A.T.show A.T.take A.T.time
## 344 0 0 0 0.3342766 0 0 0
## 888 0 0 0 0.3342766 0 0 0
## A.T.week A.T.will A.T.year H.T.clip H.T.ebola H.T.get H.T.newyorktim
## 344 0.5251313 0 0 0 0 0 0
## 888 0.5251313 0 0 0 0 0 0
## H.T.read H.T.word H.nwrds.log H.nwrds.unq.log H.sum.TfIdf
## 344 0 0 2.197225 1.791759 7.714237
## 888 0 0 2.197225 1.791759 7.714237
## H.ratio.sum.TfIdf.nwrds H.nchrs.log H.nuppr.log H.ndgts.log
## 344 0.9642796 3.850148 2.079442 0
## 888 0.9642796 3.850148 2.079442 0
## H.npnct01.log H.npnct02.log H.npnct03.log H.npnct04.log H.npnct05.log
## 344 0 0 0 0 0
## 888 0 0 0 0 0
## H.npnct06.log H.npnct07.log H.npnct08.log H.npnct09.log H.npnct10.log
## 344 0 0 0 0 0
## 888 0 0 0 0 0
## H.npnct11.log H.npnct12.log H.npnct13.log H.npnct14.log H.npnct15.log
## 344 0.6931472 0 0 0 0.6931472
## 888 0.6931472 0 0 0 0.6931472
## H.npnct16.log H.npnct17.log H.npnct18.log H.npnct19.log H.npnct20.log
## 344 0 0 0 0 0
## 888 0 0 0 0 0
## H.npnct21.log H.npnct22.log H.npnct23.log H.npnct24.log H.npnct25.log
## 344 0 0 0 0.6931472 0
## 888 0 0 0 0.6931472 0
## H.npnct26.log H.npnct27.log H.npnct28.log H.npnct29.log H.npnct30.log
## 344 0 0 0 0 0
## 888 0 0 0 0 0
## H.nstopwrds.log H.ratio.nstopwrds.nwrds H.P.http H.P.year.colon
## 344 0.6931472 0.2222222 0 0
## 888 0.6931472 0.2222222 0 0
## H.P.daily.clip.report H.P.fashion.week H.P.first.draft
## 344 0 0 0
## 888 0 0 0
## H.P.facts.figures H.P.friday.night.music H.P.no.comment.colon
## 344 0 0 0
## 888 0 0 0
## H.P.on.this.day H.P.quandary H.P.readers.respond H.P.recap.colon
## 344 0 0 0 0
## 888 0 0 0 0
## H.P.s.notebook H.P.today.in.politic H.P.today.in.smallbusiness
## 344 0 0 0
## 888 0 0 0
## H.P.verbatim.colon H.P.what.we.are S.T.appear S.T.archiv S.T.diari
## 344 0 0 0 0 0
## 888 0 0 0 0 0
## S.T.herald S.T.obama S.T.photo S.T.senat S.T.tribun S.T.word
## 344 0 0 0 0 0 0
## 888 0 0 0 0 0 0
## S.nwrds.log S.nwrds.unq.log S.sum.TfIdf S.ratio.sum.TfIdf.nwrds
## 344 3.526361 2.70805 6.561495 0.1988332
## 888 3.526361 2.70805 6.561495 0.1988332
## S.nchrs.log S.nuppr.log S.ndgts.log S.npnct01.log S.npnct02.log
## 344 5.187386 2.079442 0 0 0
## 888 5.187386 2.079442 0 0 0
## S.npnct03.log S.npnct04.log S.npnct05.log S.npnct06.log S.npnct07.log
## 344 0 0 0 0 0
## 888 0 0 0 0 0
## S.npnct08.log S.npnct09.log S.npnct10.log S.npnct11.log S.npnct12.log
## 344 0 0 0 0.6931472 0
## 888 0 0 0 0.6931472 0
## S.npnct13.log S.npnct14.log S.npnct15.log S.npnct16.log S.npnct17.log
## 344 1.386294 0 0 0 0
## 888 1.386294 0 0 0 0
## S.npnct18.log S.npnct19.log S.npnct20.log S.npnct21.log S.npnct22.log
## 344 0 1.386294 0 0 0
## 888 0 1.386294 0 0 0
## S.npnct23.log S.npnct24.log S.npnct25.log S.npnct26.log S.npnct27.log
## 344 0 0.6931472 0 0 0
## 888 0 0.6931472 0 0 0
## S.npnct28.log S.npnct29.log S.npnct30.log S.nstopwrds.log
## 344 0 0 0 2.639057
## 888 0 0 0 2.639057
## S.ratio.nstopwrds.nwrds S.P.http S.P.year.colon S.P.daily.clip.report
## 344 0.4117647 0 0 0
## 888 0.4117647 0 0 0
## S.P.fashion.week S.P.first.draft S.P.metropolitan.diary.colon
## 344 0 0 0
## 888 0 0 0
## A.T.appear A.T.archiv A.T.diari A.T.herald A.T.obama A.T.photo
## 344 0 0 0 0 0 0
## 888 0 0 0 0 0 0
## A.T.senat A.T.tribun A.T.word A.nwrds.log A.nwrds.unq.log A.sum.TfIdf
## 344 0 0 0 3.526361 2.70805 6.557008
## 888 0 0 0 3.526361 2.70805 6.557008
## A.ratio.sum.TfIdf.nwrds A.nchrs.log A.nuppr.log A.ndgts.log
## 344 0.1986972 5.187386 2.079442 0
## 888 0.1986972 5.187386 2.079442 0
## A.npnct01.log A.npnct02.log A.npnct03.log A.npnct04.log A.npnct05.log
## 344 0 0 0 0 0
## 888 0 0 0 0 0
## A.npnct06.log A.npnct07.log A.npnct08.log A.npnct09.log A.npnct10.log
## 344 0 0 0 0 0
## 888 0 0 0 0 0
## A.npnct11.log A.npnct12.log A.npnct13.log A.npnct14.log A.npnct15.log
## 344 0.6931472 0 1.386294 0 0
## 888 0.6931472 0 1.386294 0 0
## A.npnct16.log A.npnct17.log A.npnct18.log A.npnct19.log A.npnct20.log
## 344 0 0 0 1.386294 0
## 888 0 0 0 1.386294 0
## A.npnct21.log A.npnct22.log A.npnct23.log A.npnct24.log A.npnct25.log
## 344 0 0 0 0.6931472 0
## 888 0 0 0 0.6931472 0
## A.npnct26.log A.npnct27.log A.npnct28.log A.npnct29.log A.npnct30.log
## 344 0 0 0 0 0
## 888 0 0 0 0 0
## A.nstopwrds.log A.ratio.nstopwrds.nwrds A.P.http A.P.year.colon
## 344 2.639057 0.4117647 0 0
## 888 2.639057 0.4117647 0 0
## A.P.daily.clip.report A.P.fashion.week A.P.first.draft
## 344 0 0 0
## 888 0 0 0
## A.P.metropolitan.diary.colon
## 344 0
## 888 0
## [1] "max distance(0.0863) pair:"
## UniqueID Popular myCategory
## 5516 5516 0 Business#Technology#
## 6983 6983 NA Business#Technology#
## Headline
## 5516 Embracing HTTPS
## 6983 Daily Report: New Financing Round for Uber Puts Its Value at $40 Billion
## H.T.X2014 H.T.X2015 H.T.art H.T.bank H.T.big H.T.billion H.T.busi
## 5516 0 0 0 0 0 0.0000000 0
## 6983 0 0 0 0 0 0.6758134 0
## H.T.china H.T.daili H.T.day H.T.deal H.T.fashion H.T.first H.T.make
## 5516 0 0.0000000 0 0 0 0 0
## 6983 0 0.5868477 0 0 0 0 0
## H.T.morn H.T.new H.T.news H.T.newyork H.T.obama H.T.pictur
## 5516 0 0.0000000 0 0 0 0
## 6983 0 0.5341887 0 0 0 0
## H.T.polit H.T.report H.T.say H.T.springsumm H.T.take H.T.test
## 5516 0 0.0000000 0 0 0 0
## 6983 0 0.5481366 0 0 0 0
## H.T.today H.T.week S.T.articl S.T.can S.T.compani S.T.day S.T.fashion
## 5516 0 0 0 0 0 0 0
## 6983 0 0 0 0 0 0 0
## S.T.first S.T.intern S.T.make S.T.new S.T.newyork S.T.newyorktim
## 5516 0 0 0 0.0000000 0 0
## 6983 0 0 0 0.2932373 0 0
## S.T.one S.T.presid S.T.report S.T.said S.T.share S.T.show S.T.take
## 5516 0 0 0 0 0 0 0
## 6983 0 0 0 0 0 0 0
## S.T.time S.T.week S.T.will S.T.year A.T.articl A.T.can A.T.compani
## 5516 0 0 0 0 0 0 0
## 6983 0 0 0 0 0 0 0
## A.T.day A.T.fashion A.T.first A.T.intern A.T.make A.T.new
## 5516 0 0 0 0 0 0.0000000
## 6983 0 0 0 0 0 0.2930734
## A.T.newyork A.T.newyorktim A.T.one A.T.presid A.T.report A.T.said
## 5516 0 0 0 0 0 0
## 6983 0 0 0 0 0 0
## A.T.share A.T.show A.T.take A.T.time A.T.week A.T.will A.T.year
## 5516 0 0 0 0 0 0 0
## 6983 0 0 0 0 0 0 0
## H.T.clip H.T.ebola H.T.get H.T.newyorktim H.T.read H.T.word
## 5516 0 0 0 0 0 0
## 6983 0 0 0 0 0 0
## H.nwrds.log H.nwrds.unq.log H.sum.TfIdf H.ratio.sum.TfIdf.nwrds
## 5516 1.098612 1.098612 11.451555 5.7257773
## 6983 2.639057 2.302585 7.200533 0.5538872
## H.nchrs.log H.nuppr.log H.ndgts.log H.npnct01.log H.npnct02.log
## 5516 2.772589 1.945910 0.000000 0 0
## 6983 4.304065 2.397895 1.098612 0 0
## H.npnct03.log H.npnct04.log H.npnct05.log H.npnct06.log H.npnct07.log
## 5516 0 0.0000000 0 0 0
## 6983 0 0.6931472 0 0 0
## H.npnct08.log H.npnct09.log H.npnct10.log H.npnct11.log H.npnct12.log
## 5516 0 0 0 0 0
## 6983 0 0 0 0 0
## H.npnct13.log H.npnct14.log H.npnct15.log H.npnct16.log H.npnct17.log
## 5516 0 0 0.0000000 0 0
## 6983 0 0 0.6931472 0 0
## H.npnct18.log H.npnct19.log H.npnct20.log H.npnct21.log H.npnct22.log
## 5516 0 0 0 0 0
## 6983 0 0 0 0 0
## H.npnct23.log H.npnct24.log H.npnct25.log H.npnct26.log H.npnct27.log
## 5516 0 0.6931472 0 0 0
## 6983 0 0.6931472 0 0 0
## H.npnct28.log H.npnct29.log H.npnct30.log H.nstopwrds.log
## 5516 0 0 0 0.000000
## 6983 0 0 0 1.098612
## H.ratio.nstopwrds.nwrds H.P.http H.P.year.colon H.P.daily.clip.report
## 5516 0.3333333 0 0 0
## 6983 0.2142857 0 0 0
## H.P.fashion.week H.P.first.draft H.P.facts.figures
## 5516 0 0 0
## 6983 0 0 0
## H.P.friday.night.music H.P.no.comment.colon H.P.on.this.day
## 5516 0 0 0
## 6983 0 0 0
## H.P.quandary H.P.readers.respond H.P.recap.colon H.P.s.notebook
## 5516 0 0 0 0
## 6983 0 0 0 0
## H.P.today.in.politic H.P.today.in.smallbusiness H.P.verbatim.colon
## 5516 0 0 0
## 6983 0 0 0
## H.P.what.we.are S.T.appear S.T.archiv S.T.diari S.T.herald S.T.obama
## 5516 0 0 0 0 0 0
## 6983 0 0 0 0 0 0
## S.T.photo S.T.senat S.T.tribun S.T.word S.nwrds.log S.nwrds.unq.log
## 5516 0 0 0 0 3.218876 2.772589
## 6983 0 0 0 0 3.135494 2.484907
## S.sum.TfIdf S.ratio.sum.TfIdf.nwrds S.nchrs.log S.nuppr.log
## 5516 8.389406 0.3495586 5.081404 2.3978953
## 6983 6.885283 0.3129674 4.828314 0.6931472
## S.ndgts.log S.npnct01.log S.npnct02.log S.npnct03.log S.npnct04.log
## 5516 0.000000 0 0 0 0.000000
## 6983 1.609438 0 0 0 1.098612
## S.npnct05.log S.npnct06.log S.npnct07.log S.npnct08.log S.npnct09.log
## 5516 0 0 0 0 0
## 6983 0 0 0 0 0
## S.npnct10.log S.npnct11.log S.npnct12.log S.npnct13.log S.npnct14.log
## 5516 0 1.3862944 0 0.6931472 0
## 6983 0 0.6931472 0 1.0986123 0
## S.npnct15.log S.npnct16.log S.npnct17.log S.npnct18.log S.npnct19.log
## 5516 0 0 0 0 0
## 6983 0 0 0 0 0
## S.npnct20.log S.npnct21.log S.npnct22.log S.npnct23.log S.npnct24.log
## 5516 0 0 0 0 0.6931472
## 6983 0 0 0 0 0.6931472
## S.npnct25.log S.npnct26.log S.npnct27.log S.npnct28.log S.npnct29.log
## 5516 0 0 0 0 0
## 6983 0 0 0 0 0
## S.npnct30.log S.nstopwrds.log S.ratio.nstopwrds.nwrds S.P.http
## 5516 0 2.079442 0.3200000 0
## 6983 0 1.945910 0.3043478 0
## S.P.year.colon S.P.daily.clip.report S.P.fashion.week S.P.first.draft
## 5516 0 0 0 0
## 6983 0 0 0 0
## S.P.metropolitan.diary.colon A.T.appear A.T.archiv A.T.diari
## 5516 0 0 0 0
## 6983 0 0 0 0
## A.T.herald A.T.obama A.T.photo A.T.senat A.T.tribun A.T.word
## 5516 0 0 0 0 0 0
## 6983 0 0 0 0 0 0
## A.nwrds.log A.nwrds.unq.log A.sum.TfIdf A.ratio.sum.TfIdf.nwrds
## 5516 3.218876 2.772589 8.389406 0.3495586
## 6983 3.135494 2.484907 6.877865 0.3126302
## A.nchrs.log A.nuppr.log A.ndgts.log A.npnct01.log A.npnct02.log
## 5516 5.081404 2.3978953 0.000000 0 0
## 6983 4.828314 0.6931472 1.609438 0 0
## A.npnct03.log A.npnct04.log A.npnct05.log A.npnct06.log A.npnct07.log
## 5516 0 0.000000 0 0 0
## 6983 0 1.098612 0 0 0
## A.npnct08.log A.npnct09.log A.npnct10.log A.npnct11.log A.npnct12.log
## 5516 0 0 0 1.3862944 0
## 6983 0 0 0 0.6931472 0
## A.npnct13.log A.npnct14.log A.npnct15.log A.npnct16.log A.npnct17.log
## 5516 0.6931472 0 0 0 0
## 6983 1.0986123 0 0 0 0
## A.npnct18.log A.npnct19.log A.npnct20.log A.npnct21.log A.npnct22.log
## 5516 0 0 0 0 0
## 6983 0 0 0 0 0
## A.npnct23.log A.npnct24.log A.npnct25.log A.npnct26.log A.npnct27.log
## 5516 0 0.6931472 0 0 0
## 6983 0 0.6931472 0 0 0
## A.npnct28.log A.npnct29.log A.npnct30.log A.nstopwrds.log
## 5516 0 0 0 2.079442
## 6983 0 0 0 1.945910
## A.ratio.nstopwrds.nwrds A.P.http A.P.year.colon A.P.daily.clip.report
## 5516 0.3200000 0 0 0
## 6983 0.3043478 0 0 0
## A.P.fashion.week A.P.first.draft A.P.metropolitan.diary.colon
## 5516 0 0 0
## 6983 0 0 0
## [1] "min distance(0.0009) pair:"
## UniqueID Popular myCategory
## 2048 2048 0 Business#Technology#
## 6951 6951 NA Business#Technology#
## Headline H.T.X2014 H.T.X2015 H.T.art
## 2048 Court Upholds Ban on Uber in Berlin 0 0 0
## 6951 Apple iPod Lawsuit Down to One Plaintiff 0 0 0
## H.T.bank H.T.big H.T.billion H.T.busi H.T.china H.T.daili H.T.day
## 2048 0 0 0 0 0 0 0
## 6951 0 0 0 0 0 0 0
## H.T.deal H.T.fashion H.T.first H.T.make H.T.morn H.T.new H.T.news
## 2048 0 0 0 0 0 0 0
## 6951 0 0 0 0 0 0 0
## H.T.newyork H.T.obama H.T.pictur H.T.polit H.T.report H.T.say
## 2048 0 0 0 0 0 0
## 6951 0 0 0 0 0 0
## H.T.springsumm H.T.take H.T.test H.T.today H.T.week S.T.articl
## 2048 0 0 0 0 0 0
## 6951 0 0 0 0 0 0
## S.T.can S.T.compani S.T.day S.T.fashion S.T.first S.T.intern S.T.make
## 2048 0 0 0 0 0 0 0
## 6951 0 0 0 0 0 0 0
## S.T.new S.T.newyork S.T.newyorktim S.T.one S.T.presid S.T.report
## 2048 0 0 0 0.0000000 0 0
## 6951 0 0 0 0.2622993 0 0
## S.T.said S.T.share S.T.show S.T.take S.T.time S.T.week S.T.will
## 2048 0 0 0 0 0.0000000 0 0
## 6951 0 0 0 0 0.2862701 0 0
## S.T.year A.T.articl A.T.can A.T.compani A.T.day A.T.fashion A.T.first
## 2048 0 0 0 0 0 0 0
## 6951 0 0 0 0 0 0 0
## A.T.intern A.T.make A.T.new A.T.newyork A.T.newyorktim A.T.one
## 2048 0 0 0 0 0 0.0000000
## 6951 0 0 0 0 0 0.2620774
## A.T.presid A.T.report A.T.said A.T.share A.T.show A.T.take A.T.time
## 2048 0 0 0 0 0 0 0.000000
## 6951 0 0 0 0 0 0 0.285976
## A.T.week A.T.will A.T.year H.T.clip H.T.ebola H.T.get H.T.newyorktim
## 2048 0 0 0 0 0 0 0
## 6951 0 0 0 0 0 0 0
## H.T.read H.T.word H.nwrds.log H.nwrds.unq.log H.sum.TfIdf
## 2048 0 0 2.079442 1.791759 9.209497
## 6951 0 0 2.079442 1.791759 8.971094
## H.ratio.sum.TfIdf.nwrds H.nchrs.log H.nuppr.log H.ndgts.log
## 2048 1.315642 3.583519 1.791759 0
## 6951 1.281585 3.713572 1.945910 0
## H.npnct01.log H.npnct02.log H.npnct03.log H.npnct04.log H.npnct05.log
## 2048 0 0 0 0 0
## 6951 0 0 0 0 0
## H.npnct06.log H.npnct07.log H.npnct08.log H.npnct09.log H.npnct10.log
## 2048 0 0 0 0 0
## 6951 0 0 0 0 0
## H.npnct11.log H.npnct12.log H.npnct13.log H.npnct14.log H.npnct15.log
## 2048 0 0 0 0 0
## 6951 0 0 0 0 0
## H.npnct16.log H.npnct17.log H.npnct18.log H.npnct19.log H.npnct20.log
## 2048 0 0 0 0 0
## 6951 0 0 0 0 0
## H.npnct21.log H.npnct22.log H.npnct23.log H.npnct24.log H.npnct25.log
## 2048 0 0 0 0.6931472 0
## 6951 0 0 0 0.6931472 0
## H.npnct26.log H.npnct27.log H.npnct28.log H.npnct29.log H.npnct30.log
## 2048 0 0 0 0 0
## 6951 0 0 0 0 0
## H.nstopwrds.log H.ratio.nstopwrds.nwrds H.P.http H.P.year.colon
## 2048 1.0986123 0.375 0 0
## 6951 0.6931472 0.250 0 0
## H.P.daily.clip.report H.P.fashion.week H.P.first.draft
## 2048 0 0 0
## 6951 0 0 0
## H.P.facts.figures H.P.friday.night.music H.P.no.comment.colon
## 2048 0 0 0
## 6951 0 0 0
## H.P.on.this.day H.P.quandary H.P.readers.respond H.P.recap.colon
## 2048 0 0 0 0
## 6951 0 0 0 0
## H.P.s.notebook H.P.today.in.politic H.P.today.in.smallbusiness
## 2048 0 0 0
## 6951 0 0 0
## H.P.verbatim.colon H.P.what.we.are S.T.appear S.T.archiv S.T.diari
## 2048 0 0 0 0 0
## 6951 0 0 0 0 0
## S.T.herald S.T.obama S.T.photo S.T.senat S.T.tribun S.T.word
## 2048 0 0 0 0 0 0
## 6951 0 0 0 0 0 0
## S.nwrds.log S.nwrds.unq.log S.sum.TfIdf S.ratio.sum.TfIdf.nwrds
## 2048 3.526361 2.833213 7.718332 0.2338888
## 6951 3.555348 2.833213 7.830238 0.2303011
## S.nchrs.log S.nuppr.log S.ndgts.log S.npnct01.log S.npnct02.log
## 2048 5.204007 1.609438 0 0 0
## 6951 5.247024 1.609438 0 0 0
## S.npnct03.log S.npnct04.log S.npnct05.log S.npnct06.log S.npnct07.log
## 2048 0 0 0 0 0
## 6951 0 0 0 0 0
## S.npnct08.log S.npnct09.log S.npnct10.log S.npnct11.log S.npnct12.log
## 2048 0 0 0 0.6931472 0
## 6951 0 0 0 0.6931472 0
## S.npnct13.log S.npnct14.log S.npnct15.log S.npnct16.log S.npnct17.log
## 2048 0.6931472 0 0 0 0
## 6951 0.6931472 0 0 0 0
## S.npnct18.log S.npnct19.log S.npnct20.log S.npnct21.log S.npnct22.log
## 2048 0 0 0 0 0
## 6951 0 0 0 0 0
## S.npnct23.log S.npnct24.log S.npnct25.log S.npnct26.log S.npnct27.log
## 2048 0 0.6931472 0 0 0
## 6951 0 0.6931472 0 0 0
## S.npnct28.log S.npnct29.log S.npnct30.log S.nstopwrds.log
## 2048 0 0 0 2.833213
## 6951 0 0 0 2.772589
## S.ratio.nstopwrds.nwrds S.P.http S.P.year.colon S.P.daily.clip.report
## 2048 0.5000000 0 0 0
## 6951 0.4571429 0 0 0
## S.P.fashion.week S.P.first.draft S.P.metropolitan.diary.colon
## 2048 0 0 0
## 6951 0 0 0
## A.T.appear A.T.archiv A.T.diari A.T.herald A.T.obama A.T.photo
## 2048 0 0 0 0 0 0
## 6951 0 0 0 0 0 0
## A.T.senat A.T.tribun A.T.word A.nwrds.log A.nwrds.unq.log A.sum.TfIdf
## 2048 0 0 0 3.526361 2.833213 7.716210
## 6951 0 0 0 3.555348 2.833213 7.828007
## A.ratio.sum.TfIdf.nwrds A.nchrs.log A.nuppr.log A.ndgts.log
## 2048 0.2338245 5.204007 1.609438 0
## 6951 0.2302355 5.247024 1.609438 0
## A.npnct01.log A.npnct02.log A.npnct03.log A.npnct04.log A.npnct05.log
## 2048 0 0 0 0 0
## 6951 0 0 0 0 0
## A.npnct06.log A.npnct07.log A.npnct08.log A.npnct09.log A.npnct10.log
## 2048 0 0 0 0 0
## 6951 0 0 0 0 0
## A.npnct11.log A.npnct12.log A.npnct13.log A.npnct14.log A.npnct15.log
## 2048 0.6931472 0 0.6931472 0 0
## 6951 0.6931472 0 0.6931472 0 0
## A.npnct16.log A.npnct17.log A.npnct18.log A.npnct19.log A.npnct20.log
## 2048 0 0 0 0 0
## 6951 0 0 0 0 0
## A.npnct21.log A.npnct22.log A.npnct23.log A.npnct24.log A.npnct25.log
## 2048 0 0 0 0.6931472 0
## 6951 0 0 0 0.6931472 0
## A.npnct26.log A.npnct27.log A.npnct28.log A.npnct29.log A.npnct30.log
## 2048 0 0 0 0 0
## 6951 0 0 0 0 0
## A.nstopwrds.log A.ratio.nstopwrds.nwrds A.P.http A.P.year.colon
## 2048 2.833213 0.5000000 0 0
## 6951 2.772589 0.4571429 0 0
## A.P.daily.clip.report A.P.fashion.week A.P.first.draft
## 2048 0 0 0
## 6951 0 0 0
## A.P.metropolitan.diary.colon
## 2048 0
## 6951 0
## [1] "max distance(0.0735) pair:"
## UniqueID Popular myCategory
## 3125 3125 1 Science#Health#
## 4217 4217 0 Science#Health#
## Headline H.T.X2014 H.T.X2015
## 3125 A Leap in Lifespans 0 0
## 4217 Ask Well: Ebola Testing for All New Arrivals? 0 0
## H.T.art H.T.bank H.T.big H.T.billion H.T.busi H.T.china H.T.daili
## 3125 0 0 0 0 0 0 0
## 4217 0 0 0 0 0 0 0
## H.T.day H.T.deal H.T.fashion H.T.first H.T.make H.T.morn H.T.new
## 3125 0 0 0 0 0 0 0.0000000
## 4217 0 0 0 0 0 0 0.8012831
## H.T.news H.T.newyork H.T.obama H.T.pictur H.T.polit H.T.report
## 3125 0 0 0 0 0 0
## 4217 0 0 0 0 0 0
## H.T.say H.T.springsumm H.T.take H.T.test H.T.today H.T.week
## 3125 0 0 0 0.000000 0 0
## 4217 0 0 0 1.025646 0 0
## S.T.articl S.T.can S.T.compani S.T.day S.T.fashion S.T.first
## 3125 0 0 0 0 0 0
## 4217 0 0 0 0 0 0
## S.T.intern S.T.make S.T.new S.T.newyork S.T.newyorktim S.T.one
## 3125 0 0 0 0 0 0
## 4217 0 0 0 0 0 0
## S.T.presid S.T.report S.T.said S.T.share S.T.show S.T.take S.T.time
## 3125 0 0 0 0 0 0 0
## 4217 0 0 0 0 0 0 0
## S.T.week S.T.will S.T.year A.T.articl A.T.can A.T.compani A.T.day
## 3125 0 0 0 0 0 0 0
## 4217 0 0 0 0 0 0 0
## A.T.fashion A.T.first A.T.intern A.T.make A.T.new A.T.newyork
## 3125 0 0 0 0 0 0
## 4217 0 0 0 0 0 0
## A.T.newyorktim A.T.one A.T.presid A.T.report A.T.said A.T.share
## 3125 0 0 0 0 0 0
## 4217 0 0 0 0 0 0
## A.T.show A.T.take A.T.time A.T.week A.T.will A.T.year H.T.clip
## 3125 0 0 0 0 0 0 0
## 4217 0 0 0 0 0 0 0
## H.T.ebola H.T.get H.T.newyorktim H.T.read H.T.word H.nwrds.log
## 3125 0.000000 0 0 0 0 1.609438
## 4217 1.096181 0 0 0 0 2.197225
## H.nwrds.unq.log H.sum.TfIdf H.ratio.sum.TfIdf.nwrds H.nchrs.log
## 3125 1.098612 12.036517 3.0091293 2.995732
## 4217 1.945910 7.294388 0.9117985 3.828641
## H.nuppr.log H.ndgts.log H.npnct01.log H.npnct02.log H.npnct03.log
## 3125 1.386294 0 0 0 0
## 4217 2.079442 0 0 0 0
## H.npnct04.log H.npnct05.log H.npnct06.log H.npnct07.log H.npnct08.log
## 3125 0 0 0 0 0
## 4217 0 0 0 0 0
## H.npnct09.log H.npnct10.log H.npnct11.log H.npnct12.log H.npnct13.log
## 3125 0 0 0 0 0
## 4217 0 0 0 0 0
## H.npnct14.log H.npnct15.log H.npnct16.log H.npnct17.log H.npnct18.log
## 3125 0 0.0000000 0 0 0
## 4217 0 0.6931472 0 0 0
## H.npnct19.log H.npnct20.log H.npnct21.log H.npnct22.log H.npnct23.log
## 3125 0.0000000 0 0 0 0
## 4217 0.6931472 0 0 0 0
## H.npnct24.log H.npnct25.log H.npnct26.log H.npnct27.log H.npnct28.log
## 3125 0.6931472 0 0 0 0
## 4217 0.6931472 0 0 0 0
## H.npnct29.log H.npnct30.log H.nstopwrds.log H.ratio.nstopwrds.nwrds
## 3125 0 0 0.6931472 0.4000000
## 4217 0 0 0.6931472 0.2222222
## H.P.http H.P.year.colon H.P.daily.clip.report H.P.fashion.week
## 3125 0 0 0 0
## 4217 0 0 0 0
## H.P.first.draft H.P.facts.figures H.P.friday.night.music
## 3125 0 0 0
## 4217 0 0 0
## H.P.no.comment.colon H.P.on.this.day H.P.quandary H.P.readers.respond
## 3125 0 0 0 0
## 4217 0 0 0 0
## H.P.recap.colon H.P.s.notebook H.P.today.in.politic
## 3125 0 0 0
## 4217 0 0 0
## H.P.today.in.smallbusiness H.P.verbatim.colon H.P.what.we.are
## 3125 0 0 0
## 4217 0 0 0
## S.T.appear S.T.archiv S.T.diari S.T.herald S.T.obama S.T.photo
## 3125 0 0 0 0 0 0
## 4217 0 0 0 0 0 0
## S.T.senat S.T.tribun S.T.word S.nwrds.log S.nwrds.unq.log S.sum.TfIdf
## 3125 0 0 0 2.302585 1.791759 7.546173
## 4217 0 0 0 3.218876 2.708050 7.982366
## S.ratio.sum.TfIdf.nwrds S.nchrs.log S.nuppr.log S.ndgts.log
## 3125 0.8384636 3.970292 0.6931472 0
## 4217 0.3325986 4.955827 1.9459101 0
## S.npnct01.log S.npnct02.log S.npnct03.log S.npnct04.log S.npnct05.log
## 3125 0 0 0 0 0
## 4217 0 0 0 0 0
## S.npnct06.log S.npnct07.log S.npnct08.log S.npnct09.log S.npnct10.log
## 3125 0 0 0 0 0
## 4217 0 0 0 0 0
## S.npnct11.log S.npnct12.log S.npnct13.log S.npnct14.log S.npnct15.log
## 3125 0.6931472 0 0.6931472 0 0.0000000
## 4217 0.0000000 0 0.0000000 0 0.6931472
## S.npnct16.log S.npnct17.log S.npnct18.log S.npnct19.log S.npnct20.log
## 3125 0 0 0 0.000000 0
## 4217 0 0 0 1.098612 0
## S.npnct21.log S.npnct22.log S.npnct23.log S.npnct24.log S.npnct25.log
## 3125 0 0 0 0.6931472 0
## 4217 0 0 0 0.6931472 0
## S.npnct26.log S.npnct27.log S.npnct28.log S.npnct29.log S.npnct30.log
## 3125 0 0 0 0 0
## 4217 0 0 0 0 0
## S.nstopwrds.log S.ratio.nstopwrds.nwrds S.P.http S.P.year.colon
## 3125 1.386294 0.40 0 0
## 4217 2.197225 0.36 0 0
## S.P.daily.clip.report S.P.fashion.week S.P.first.draft
## 3125 0 0 0
## 4217 0 0 0
## S.P.metropolitan.diary.colon A.T.appear A.T.archiv A.T.diari
## 3125 0 0 0 0
## 4217 0 0 0 0
## A.T.herald A.T.obama A.T.photo A.T.senat A.T.tribun A.T.word
## 3125 0 0 0 0 0 0
## 4217 0 0 0 0 0 0
## A.nwrds.log A.nwrds.unq.log A.sum.TfIdf A.ratio.sum.TfIdf.nwrds
## 3125 2.302585 1.791759 7.546173 0.8384636
## 4217 3.218876 2.708050 7.981215 0.3325506
## A.nchrs.log A.nuppr.log A.ndgts.log A.npnct01.log A.npnct02.log
## 3125 3.970292 0.6931472 0 0 0
## 4217 4.955827 1.9459101 0 0 0
## A.npnct03.log A.npnct04.log A.npnct05.log A.npnct06.log A.npnct07.log
## 3125 0 0 0 0 0
## 4217 0 0 0 0 0
## A.npnct08.log A.npnct09.log A.npnct10.log A.npnct11.log A.npnct12.log
## 3125 0 0 0 0.6931472 0
## 4217 0 0 0 0.0000000 0
## A.npnct13.log A.npnct14.log A.npnct15.log A.npnct16.log A.npnct17.log
## 3125 0.6931472 0 0.0000000 0 0
## 4217 0.0000000 0 0.6931472 0 0
## A.npnct18.log A.npnct19.log A.npnct20.log A.npnct21.log A.npnct22.log
## 3125 0 0.000000 0 0 0
## 4217 0 1.098612 0 0 0
## A.npnct23.log A.npnct24.log A.npnct25.log A.npnct26.log A.npnct27.log
## 3125 0 0.6931472 0 0 0
## 4217 0 0.6931472 0 0 0
## A.npnct28.log A.npnct29.log A.npnct30.log A.nstopwrds.log
## 3125 0 0 0 1.386294
## 4217 0 0 0 2.197225
## A.ratio.nstopwrds.nwrds A.P.http A.P.year.colon A.P.daily.clip.report
## 3125 0.40 0 0 0
## 4217 0.36 0 0 0
## A.P.fashion.week A.P.first.draft A.P.metropolitan.diary.colon
## 3125 0 0 0
## 4217 0 0 0
## [1] "min distance(0.0005) pair:"
## UniqueID Popular myCategory Headline
## 5786 5786 1 Science#Health# An Incipient Threat to Our Hearts
## 5729 5729 0 Science#Health# Obesity Is Tied to Pollutants
## H.T.X2014 H.T.X2015 H.T.art H.T.bank H.T.big H.T.billion H.T.busi
## 5786 0 0 0 0 0 0 0
## 5729 0 0 0 0 0 0 0
## H.T.china H.T.daili H.T.day H.T.deal H.T.fashion H.T.first H.T.make
## 5786 0 0 0 0 0 0 0
## 5729 0 0 0 0 0 0 0
## H.T.morn H.T.new H.T.news H.T.newyork H.T.obama H.T.pictur H.T.polit
## 5786 0 0 0 0 0 0 0
## 5729 0 0 0 0 0 0 0
## H.T.report H.T.say H.T.springsumm H.T.take H.T.test H.T.today
## 5786 0 0 0 0 0 0
## 5729 0 0 0 0 0 0
## H.T.week S.T.articl S.T.can S.T.compani S.T.day S.T.fashion S.T.first
## 5786 0 0 0 0 0 0 0
## 5729 0 0 0 0 0 0 0
## S.T.intern S.T.make S.T.new S.T.newyork S.T.newyorktim S.T.one
## 5786 0 0 0.2345898 0 0 0
## 5729 0 0 0.2199280 0 0 0
## S.T.presid S.T.report S.T.said S.T.share S.T.show S.T.take S.T.time
## 5786 0 0 0 0 0 0 0
## 5729 0 0 0 0 0 0 0
## S.T.week S.T.will S.T.year A.T.articl A.T.can A.T.compani A.T.day
## 5786 0 0 0 0 0 0 0
## 5729 0 0 0 0 0 0 0
## A.T.fashion A.T.first A.T.intern A.T.make A.T.new A.T.newyork
## 5786 0 0 0 0 0.2344587 0
## 5729 0 0 0 0 0.2198051 0
## A.T.newyorktim A.T.one A.T.presid A.T.report A.T.said A.T.share
## 5786 0 0 0 0 0 0
## 5729 0 0 0 0 0 0
## A.T.show A.T.take A.T.time A.T.week A.T.will A.T.year H.T.clip
## 5786 0 0 0 0 0 0 0
## 5729 0 0 0 0 0 0 0
## H.T.ebola H.T.get H.T.newyorktim H.T.read H.T.word H.nwrds.log
## 5786 0 0 0 0 0 1.945910
## 5729 0 0 0 0 0 1.791759
## H.nwrds.unq.log H.sum.TfIdf H.ratio.sum.TfIdf.nwrds H.nchrs.log
## 5786 1.386294 10.688386 1.781398 3.526361
## 5729 1.386294 9.905745 1.981149 3.401197
## H.nuppr.log H.ndgts.log H.npnct01.log H.npnct02.log H.npnct03.log
## 5786 1.791759 0 0 0 0
## 5729 1.609438 0 0 0 0
## H.npnct04.log H.npnct05.log H.npnct06.log H.npnct07.log H.npnct08.log
## 5786 0 0 0 0 0
## 5729 0 0 0 0 0
## H.npnct09.log H.npnct10.log H.npnct11.log H.npnct12.log H.npnct13.log
## 5786 0 0 0 0 0
## 5729 0 0 0 0 0
## H.npnct14.log H.npnct15.log H.npnct16.log H.npnct17.log H.npnct18.log
## 5786 0 0 0 0 0
## 5729 0 0 0 0 0
## H.npnct19.log H.npnct20.log H.npnct21.log H.npnct22.log H.npnct23.log
## 5786 0 0 0 0 0
## 5729 0 0 0 0 0
## H.npnct24.log H.npnct25.log H.npnct26.log H.npnct27.log H.npnct28.log
## 5786 0.6931472 0 0 0 0
## 5729 0.6931472 0 0 0 0
## H.npnct29.log H.npnct30.log H.nstopwrds.log H.ratio.nstopwrds.nwrds
## 5786 0 0 0.6931472 0.2857143
## 5729 0 0 0.6931472 0.3333333
## H.P.http H.P.year.colon H.P.daily.clip.report H.P.fashion.week
## 5786 0 0 0 0
## 5729 0 0 0 0
## H.P.first.draft H.P.facts.figures H.P.friday.night.music
## 5786 0 0 0
## 5729 0 0 0
## H.P.no.comment.colon H.P.on.this.day H.P.quandary H.P.readers.respond
## 5786 0 0 0 0
## 5729 0 0 0 0
## H.P.recap.colon H.P.s.notebook H.P.today.in.politic
## 5786 0 0 0
## 5729 0 0 0
## H.P.today.in.smallbusiness H.P.verbatim.colon H.P.what.we.are
## 5786 0 0 0
## 5729 0 0 0
## S.T.appear S.T.archiv S.T.diari S.T.herald S.T.obama S.T.photo
## 5786 0 0 0 0 0 0
## 5729 0 0 0 0 0 0
## S.T.senat S.T.tribun S.T.word S.nwrds.log S.nwrds.unq.log S.sum.TfIdf
## 5786 0 0 0 3.178054 2.772589 9.418882
## 5729 0 0 0 3.178054 2.833213 8.823994
## S.ratio.sum.TfIdf.nwrds S.nchrs.log S.nuppr.log S.ndgts.log
## 5786 0.4095166 5.159055 0.6931472 0
## 5729 0.3836519 4.948760 0.6931472 0
## S.npnct01.log S.npnct02.log S.npnct03.log S.npnct04.log S.npnct05.log
## 5786 0 0 0 0 0
## 5729 0 0 0 0 0
## S.npnct06.log S.npnct07.log S.npnct08.log S.npnct09.log S.npnct10.log
## 5786 0 0 0 0 0
## 5729 0 0 0 0 0
## S.npnct11.log S.npnct12.log S.npnct13.log S.npnct14.log S.npnct15.log
## 5786 0.6931472 0 0.6931472 0 0
## 5729 0.6931472 0 0.6931472 0 0
## S.npnct16.log S.npnct17.log S.npnct18.log S.npnct19.log S.npnct20.log
## 5786 0 0 0 0 0
## 5729 0 0 0 0 0
## S.npnct21.log S.npnct22.log S.npnct23.log S.npnct24.log S.npnct25.log
## 5786 0 0 0 0.6931472 0
## 5729 0 0 0 0.6931472 0
## S.npnct26.log S.npnct27.log S.npnct28.log S.npnct29.log S.npnct30.log
## 5786 0 0 0 0 0
## 5729 0 0 0 0 0
## S.nstopwrds.log S.ratio.nstopwrds.nwrds S.P.http S.P.year.colon
## 5786 2.079442 0.3333333 0 0
## 5729 2.079442 0.3333333 0 0
## S.P.daily.clip.report S.P.fashion.week S.P.first.draft
## 5786 0 0 0
## 5729 0 0 0
## S.P.metropolitan.diary.colon A.T.appear A.T.archiv A.T.diari
## 5786 0 0 0 0
## 5729 0 0 0 0
## A.T.herald A.T.obama A.T.photo A.T.senat A.T.tribun A.T.word
## 5786 0 0 0 0 0 0
## 5729 0 0 0 0 0 0
## A.nwrds.log A.nwrds.unq.log A.sum.TfIdf A.ratio.sum.TfIdf.nwrds
## 5786 3.178054 2.772589 9.416186 0.4093994
## 5729 3.178054 2.833213 8.822451 0.3835848
## A.nchrs.log A.nuppr.log A.ndgts.log A.npnct01.log A.npnct02.log
## 5786 5.159055 0.6931472 0 0 0
## 5729 4.948760 0.6931472 0 0 0
## A.npnct03.log A.npnct04.log A.npnct05.log A.npnct06.log A.npnct07.log
## 5786 0 0 0 0 0
## 5729 0 0 0 0 0
## A.npnct08.log A.npnct09.log A.npnct10.log A.npnct11.log A.npnct12.log
## 5786 0 0 0 0.6931472 0
## 5729 0 0 0 0.6931472 0
## A.npnct13.log A.npnct14.log A.npnct15.log A.npnct16.log A.npnct17.log
## 5786 0.6931472 0 0 0 0
## 5729 0.6931472 0 0 0 0
## A.npnct18.log A.npnct19.log A.npnct20.log A.npnct21.log A.npnct22.log
## 5786 0 0 0 0 0
## 5729 0 0 0 0 0
## A.npnct23.log A.npnct24.log A.npnct25.log A.npnct26.log A.npnct27.log
## 5786 0 0.6931472 0 0 0
## 5729 0 0.6931472 0 0 0
## A.npnct28.log A.npnct29.log A.npnct30.log A.nstopwrds.log
## 5786 0 0 0 2.079442
## 5729 0 0 0 2.079442
## A.ratio.nstopwrds.nwrds A.P.http A.P.year.colon A.P.daily.clip.report
## 5786 0.3333333 0 0 0
## 5729 0.3333333 0 0 0
## A.P.fashion.week A.P.first.draft A.P.metropolitan.diary.colon
## 5786 0 0 0
## 5729 0 0 0
## [1] "max distance(0.1177) pair:"
## UniqueID Popular myCategory
## 164 164 0 Culture#Arts#
## 4232 4232 1 Culture#Arts#
## Headline H.T.X2014
## 164 Dan O'Brien and Suzan-Lori Parks Win Horton Foote Prize 0
## 4232 'The Good Wife' Recap: How to Make Will Gardner Go Away 0
## H.T.X2015 H.T.art H.T.bank H.T.big H.T.billion H.T.busi H.T.china
## 164 0 0 0 0 0 0 0
## 4232 0 0 0 0 0 0 0
## H.T.daili H.T.day H.T.deal H.T.fashion H.T.first H.T.make H.T.morn
## 164 0 0 0 0 0 0.0000000 0
## 4232 0 0 0 0 0 0.9071452 0
## H.T.new H.T.news H.T.newyork H.T.obama H.T.pictur H.T.polit
## 164 0 0 0 0 0 0
## 4232 0 0 0 0 0 0
## H.T.report H.T.say H.T.springsumm H.T.take H.T.test H.T.today
## 164 0 0 0 0 0 0
## 4232 0 0 0 0 0 0
## H.T.week S.T.articl S.T.can S.T.compani S.T.day S.T.fashion S.T.first
## 164 0 0 0 0 0 0 0
## 4232 0 0 0 0 0 0 0
## S.T.intern S.T.make S.T.new S.T.newyork S.T.newyorktim S.T.one
## 164 0 0 0.3518848 0 0 0
## 4232 0 0 0.0000000 0 0 0
## S.T.presid S.T.report S.T.said S.T.share S.T.show S.T.take S.T.time
## 164 0 0 0 0 0 0 0
## 4232 0 0 0 0 0 0 0
## S.T.week S.T.will S.T.year A.T.articl A.T.can A.T.compani A.T.day
## 164 0 0 0 0 0 0 0
## 4232 0 0 0 0 0 0 0
## A.T.fashion A.T.first A.T.intern A.T.make A.T.new A.T.newyork
## 164 0 0 0 0 0.3516881 0
## 4232 0 0 0 0 0.0000000 0
## A.T.newyorktim A.T.one A.T.presid A.T.report A.T.said A.T.share
## 164 0 0 0 0 0 0
## 4232 0 0 0 0 0 0
## A.T.show A.T.take A.T.time A.T.week A.T.will A.T.year H.T.clip
## 164 0 0 0 0 0 0 0
## 4232 0 0 0 0 0 0 0
## H.T.ebola H.T.get H.T.newyorktim H.T.read H.T.word H.nwrds.log
## 164 0 0 0 0 0 2.484907
## 4232 0 0 0 0 0 2.484907
## H.nwrds.unq.log H.sum.TfIdf H.ratio.sum.TfIdf.nwrds H.nchrs.log
## 164 2.197225 10.74563 0.9768753 4.025352
## 4232 2.079442 7.85657 0.7142337 4.025352
## H.nuppr.log H.ndgts.log H.npnct01.log H.npnct02.log H.npnct03.log
## 164 2.397895 0 0 0 0
## 4232 2.397895 0 0 0 0
## H.npnct04.log H.npnct05.log H.npnct06.log H.npnct07.log H.npnct08.log
## 164 0 0 0 0.6931472 0
## 4232 0 0 0 1.0986123 0
## H.npnct09.log H.npnct10.log H.npnct11.log H.npnct12.log H.npnct13.log
## 164 0 0 0 0.6931472 0
## 4232 0 0 0 0.0000000 0
## H.npnct14.log H.npnct15.log H.npnct16.log H.npnct17.log H.npnct18.log
## 164 0 0.0000000 0 0 0
## 4232 0 0.6931472 0 0 0
## H.npnct19.log H.npnct20.log H.npnct21.log H.npnct22.log H.npnct23.log
## 164 0 0 0 0 0
## 4232 0 0 0 0 0
## H.npnct24.log H.npnct25.log H.npnct26.log H.npnct27.log H.npnct28.log
## 164 0.6931472 0 0 0 0
## 4232 0.6931472 0 0 0 0
## H.npnct29.log H.npnct30.log H.nstopwrds.log H.ratio.nstopwrds.nwrds
## 164 0 0 0.6931472 0.1666667
## 4232 0 0 0.6931472 0.1666667
## H.P.http H.P.year.colon H.P.daily.clip.report H.P.fashion.week
## 164 0 0 0 0
## 4232 0 0 0 0
## H.P.first.draft H.P.facts.figures H.P.friday.night.music
## 164 0 0 0
## 4232 0 0 0
## H.P.no.comment.colon H.P.on.this.day H.P.quandary H.P.readers.respond
## 164 0 0 0 0
## 4232 0 0 0 0
## H.P.recap.colon H.P.s.notebook H.P.today.in.politic
## 164 0 0 0
## 4232 1 0 0
## H.P.today.in.smallbusiness H.P.verbatim.colon H.P.what.we.are
## 164 0 0 0
## 4232 0 0 0
## S.T.appear S.T.archiv S.T.diari S.T.herald S.T.obama S.T.photo
## 164 0 0 0 0 0 0
## 4232 0 0 0 0 0 0
## S.T.senat S.T.tribun S.T.word S.nwrds.log S.nwrds.unq.log S.sum.TfIdf
## 164 0 0 0 3.610918 2.833213 7.025941
## 4232 0 0 0 1.791759 1.386294 10.649893
## S.ratio.sum.TfIdf.nwrds S.nchrs.log S.nuppr.log S.ndgts.log
## 164 0.195165 5.323010 2.833213 1.386294
## 4232 2.129979 3.218876 1.386294 0.000000
## S.npnct01.log S.npnct02.log S.npnct03.log S.npnct04.log S.npnct05.log
## 164 0.0000000 0 0 0 0
## 4232 0.6931472 0 0 0 0
## S.npnct06.log S.npnct07.log S.npnct08.log S.npnct09.log S.npnct10.log
## 164 0.6931472 0 1.098612 0 0
## 4232 0.0000000 0 0.000000 0 0
## S.npnct11.log S.npnct12.log S.npnct13.log S.npnct14.log S.npnct15.log
## 164 1.386294 0 1.386294 0 0
## 4232 0.000000 0 0.000000 0 0
## S.npnct16.log S.npnct17.log S.npnct18.log S.npnct19.log S.npnct20.log
## 164 0 0 0 0 0
## 4232 0 0 0 0 0
## S.npnct21.log S.npnct22.log S.npnct23.log S.npnct24.log S.npnct25.log
## 164 0 0 0 0.6931472 0
## 4232 0 0 0 0.6931472 0
## S.npnct26.log S.npnct27.log S.npnct28.log S.npnct29.log S.npnct30.log
## 164 0 0 0 0 0
## 4232 0 0 0 0 0
## S.nstopwrds.log S.ratio.nstopwrds.nwrds S.P.http S.P.year.colon
## 164 2.3025851 0.2702703 0 0
## 4232 0.6931472 0.3333333 0 0
## S.P.daily.clip.report S.P.fashion.week S.P.first.draft
## 164 0 0 0
## 4232 0 0 0
## S.P.metropolitan.diary.colon A.T.appear A.T.archiv A.T.diari
## 164 0 0 0 0
## 4232 0 0 0 0
## A.T.herald A.T.obama A.T.photo A.T.senat A.T.tribun A.T.word
## 164 0 0 0 0 0 0
## 4232 0 0 0 0 0 0
## A.nwrds.log A.nwrds.unq.log A.sum.TfIdf A.ratio.sum.TfIdf.nwrds
## 164 3.610918 2.833213 7.023816 0.195106
## 4232 1.791759 1.386294 10.643214 2.128643
## A.nchrs.log A.nuppr.log A.ndgts.log A.npnct01.log A.npnct02.log
## 164 5.323010 2.833213 1.386294 0.0000000 0
## 4232 3.218876 1.386294 0.000000 0.6931472 0
## A.npnct03.log A.npnct04.log A.npnct05.log A.npnct06.log A.npnct07.log
## 164 0 0 0 0.6931472 0
## 4232 0 0 0 0.0000000 0
## A.npnct08.log A.npnct09.log A.npnct10.log A.npnct11.log A.npnct12.log
## 164 1.098612 0 0 1.386294 0
## 4232 0.000000 0 0 0.000000 0
## A.npnct13.log A.npnct14.log A.npnct15.log A.npnct16.log A.npnct17.log
## 164 1.386294 0 0 0 0
## 4232 0.000000 0 0 0 0
## A.npnct18.log A.npnct19.log A.npnct20.log A.npnct21.log A.npnct22.log
## 164 0 0 0 0 0
## 4232 0 0 0 0 0
## A.npnct23.log A.npnct24.log A.npnct25.log A.npnct26.log A.npnct27.log
## 164 0 0.6931472 0 0 0
## 4232 0 0.6931472 0 0 0
## A.npnct28.log A.npnct29.log A.npnct30.log A.nstopwrds.log
## 164 0 0 0 2.3025851
## 4232 0 0 0 0.6931472
## A.ratio.nstopwrds.nwrds A.P.http A.P.year.colon A.P.daily.clip.report
## 164 0.2702703 0 0 0
## 4232 0.3333333 0 0 0
## A.P.fashion.week A.P.first.draft A.P.metropolitan.diary.colon
## 164 0 0 0
## 4232 0 0 0
## [1] "min distance(0.0000) pair:"
## UniqueID Popular myCategory
## 376 376 0 Culture#Arts#
## 901 901 0 Culture#Arts#
## Headline H.T.X2014
## 376 International Arts Events Happening in the Week Ahead 0
## 901 International Arts Events Happening in the Week Ahead 0
## H.T.X2015 H.T.art H.T.bank H.T.big H.T.billion H.T.busi H.T.china
## 376 0 1.049175 0 0 0 0 0
## 901 0 1.049175 0 0 0 0 0
## H.T.daili H.T.day H.T.deal H.T.fashion H.T.first H.T.make H.T.morn
## 376 0 0 0 0 0 0 0
## 901 0 0 0 0 0 0 0
## H.T.new H.T.news H.T.newyork H.T.obama H.T.pictur H.T.polit H.T.report
## 376 0 0 0 0 0 0 0
## 901 0 0 0 0 0 0 0
## H.T.say H.T.springsumm H.T.take H.T.test H.T.today H.T.week
## 376 0 0 0 0 0 0.8266169
## 901 0 0 0 0 0 0.8266169
## S.T.articl S.T.can S.T.compani S.T.day S.T.fashion S.T.first
## 376 0 0 0 0 0 0
## 901 0 0 0 0 0 0
## S.T.intern S.T.make S.T.new S.T.newyork S.T.newyorktim S.T.one
## 376 0 0 0 0 0 0
## 901 0 0 0 0 0 0
## S.T.presid S.T.report S.T.said S.T.share S.T.show S.T.take S.T.time
## 376 0 0 0 0 0 0.552869 0
## 901 0 0 0 0 0 0.552869 0
## S.T.week S.T.will S.T.year A.T.articl A.T.can A.T.compani A.T.day
## 376 0.4376094 0 0 0 0 0 0
## 901 0.4376094 0 0 0 0 0 0
## A.T.fashion A.T.first A.T.intern A.T.make A.T.new A.T.newyork
## 376 0 0 0 0 0 0
## 901 0 0 0 0 0 0
## A.T.newyorktim A.T.one A.T.presid A.T.report A.T.said A.T.share
## 376 0 0 0 0 0 0
## 901 0 0 0 0 0 0
## A.T.show A.T.take A.T.time A.T.week A.T.will A.T.year H.T.clip
## 376 0 0.5522698 0 0.4376094 0 0 0
## 901 0 0.5522698 0 0.4376094 0 0 0
## H.T.ebola H.T.get H.T.newyorktim H.T.read H.T.word H.nwrds.log
## 376 0 0 0 0 0 2.197225
## 901 0 0 0 0 0 2.197225
## H.nwrds.unq.log H.sum.TfIdf H.ratio.sum.TfIdf.nwrds H.nchrs.log
## 376 1.94591 7.508951 0.9386189 3.988984
## 901 1.94591 7.508951 0.9386189 3.988984
## H.nuppr.log H.ndgts.log H.npnct01.log H.npnct02.log H.npnct03.log
## 376 1.94591 0 0 0 0
## 901 1.94591 0 0 0 0
## H.npnct04.log H.npnct05.log H.npnct06.log H.npnct07.log H.npnct08.log
## 376 0 0 0 0 0
## 901 0 0 0 0 0
## H.npnct09.log H.npnct10.log H.npnct11.log H.npnct12.log H.npnct13.log
## 376 0 0 0 0 0
## 901 0 0 0 0 0
## H.npnct14.log H.npnct15.log H.npnct16.log H.npnct17.log H.npnct18.log
## 376 0 0 0 0 0
## 901 0 0 0 0 0
## H.npnct19.log H.npnct20.log H.npnct21.log H.npnct22.log H.npnct23.log
## 376 0 0 0 0 0
## 901 0 0 0 0 0
## H.npnct24.log H.npnct25.log H.npnct26.log H.npnct27.log H.npnct28.log
## 376 0.6931472 0 0 0 0
## 901 0.6931472 0 0 0 0
## H.npnct29.log H.npnct30.log H.nstopwrds.log H.ratio.nstopwrds.nwrds
## 376 0 0 1.098612 0.3333333
## 901 0 0 1.098612 0.3333333
## H.P.http H.P.year.colon H.P.daily.clip.report H.P.fashion.week
## 376 0 0 0 0
## 901 0 0 0 0
## H.P.first.draft H.P.facts.figures H.P.friday.night.music
## 376 0 0 0
## 901 0 0 0
## H.P.no.comment.colon H.P.on.this.day H.P.quandary H.P.readers.respond
## 376 0 0 0 0
## 901 0 0 0 0
## H.P.recap.colon H.P.s.notebook H.P.today.in.politic
## 376 0 0 0
## 901 0 0 0
## H.P.today.in.smallbusiness H.P.verbatim.colon H.P.what.we.are
## 376 0 0 0
## 901 0 0 0
## S.T.appear S.T.archiv S.T.diari S.T.herald S.T.obama S.T.photo
## 376 0 0 0 0 0 0
## 901 0 0 0 0 0 0
## S.T.senat S.T.tribun S.T.word S.nwrds.log S.nwrds.unq.log S.sum.TfIdf
## 376 0 0 0 2.639057 2.302585 6.163319
## 901 0 0 0 2.639057 2.302585 6.163319
## S.ratio.sum.TfIdf.nwrds S.nchrs.log S.nuppr.log S.ndgts.log
## 376 0.4741014 4.317488 0.6931472 0
## 901 0.4741014 4.317488 0.6931472 0
## S.npnct01.log S.npnct02.log S.npnct03.log S.npnct04.log S.npnct05.log
## 376 0 0 0 0 0
## 901 0 0 0 0 0
## S.npnct06.log S.npnct07.log S.npnct08.log S.npnct09.log S.npnct10.log
## 376 0 0 0 0 0
## 901 0 0 0 0 0
## S.npnct11.log S.npnct12.log S.npnct13.log S.npnct14.log S.npnct15.log
## 376 0 0 0.6931472 0 0
## 901 0 0 0.6931472 0 0
## S.npnct16.log S.npnct17.log S.npnct18.log S.npnct19.log S.npnct20.log
## 376 0 0 0 0 0
## 901 0 0 0 0 0
## S.npnct21.log S.npnct22.log S.npnct23.log S.npnct24.log S.npnct25.log
## 376 0 0 0 0.6931472 0
## 901 0 0 0 0.6931472 0
## S.npnct26.log S.npnct27.log S.npnct28.log S.npnct29.log S.npnct30.log
## 376 0 0 0 0 0
## 901 0 0 0 0 0
## S.nstopwrds.log S.ratio.nstopwrds.nwrds S.P.http S.P.year.colon
## 376 1.386294 0.2857143 0 0
## 901 1.386294 0.2857143 0 0
## S.P.daily.clip.report S.P.fashion.week S.P.first.draft
## 376 0 0 0
## 901 0 0 0
## S.P.metropolitan.diary.colon A.T.appear A.T.archiv A.T.diari
## 376 0 0 0 0
## 901 0 0 0 0
## A.T.herald A.T.obama A.T.photo A.T.senat A.T.tribun A.T.word
## 376 0 0 0 0 0 0
## 901 0 0 0 0 0 0
## A.nwrds.log A.nwrds.unq.log A.sum.TfIdf A.ratio.sum.TfIdf.nwrds
## 376 2.639057 2.302585 6.156981 0.4736139
## 901 2.639057 2.302585 6.156981 0.4736139
## A.nchrs.log A.nuppr.log A.ndgts.log A.npnct01.log A.npnct02.log
## 376 4.317488 0.6931472 0 0 0
## 901 4.317488 0.6931472 0 0 0
## A.npnct03.log A.npnct04.log A.npnct05.log A.npnct06.log A.npnct07.log
## 376 0 0 0 0 0
## 901 0 0 0 0 0
## A.npnct08.log A.npnct09.log A.npnct10.log A.npnct11.log A.npnct12.log
## 376 0 0 0 0 0
## 901 0 0 0 0 0
## A.npnct13.log A.npnct14.log A.npnct15.log A.npnct16.log A.npnct17.log
## 376 0.6931472 0 0 0 0
## 901 0.6931472 0 0 0 0
## A.npnct18.log A.npnct19.log A.npnct20.log A.npnct21.log A.npnct22.log
## 376 0 0 0 0 0
## 901 0 0 0 0 0
## A.npnct23.log A.npnct24.log A.npnct25.log A.npnct26.log A.npnct27.log
## 376 0 0.6931472 0 0 0
## 901 0 0.6931472 0 0 0
## A.npnct28.log A.npnct29.log A.npnct30.log A.nstopwrds.log
## 376 0 0 0 1.386294
## 901 0 0 0 1.386294
## A.ratio.nstopwrds.nwrds A.P.http A.P.year.colon A.P.daily.clip.report
## 376 0.2857143 0 0 0
## 901 0.2857143 0 0 0
## A.P.fashion.week A.P.first.draft A.P.metropolitan.diary.colon
## 376 0 0 0
## 901 0 0 0
## myCategory .clusterid N Y NA
## 43 Business#Crosswords/Games# 1 20 103 42
## 70 OpEd#Opinion# 1101 28 90 37
## 79 Science#Health# 101 36 75 29
## 71 OpEd#Opinion# 1102 15 74 22
## 72 OpEd#Opinion# 1103 10 56 18
## 73 OpEd#Opinion# 1104 13 49 20
## 84 Styles#U.S.# 1801 35 44 21
## 74 OpEd#Opinion# 1105 14 38 17
## 75 OpEd#Opinion# 1106 8 34 16
## 85 Styles#U.S.# 1802 22 32 23
## 29 Business#Business Day#Dealbook 501 117 29 49
## 80 Science#Health# 102 21 28 15
## 77 OpEd#Opinion# 1108 10 28 8
## 53 Culture#Arts# 403 46 21 23
## 76 OpEd#Opinion# 1107 16 21 17
## 86 Styles#U.S.# 1803 8 18 14
## 78 OpEd#Opinion# 1109 1 18 9
## 68 Metro#N.Y. / Region# 1 181 17 67
## 27 #Opinion#The Public Editor 1 4 16 10
## 3 ## 1503 79 13 19
## 49 Business#Technology# 706 29 13 10
## 81 Science#Health# 103 10 11 6
## 44 Business#Technology# 701 50 10 25
## 21 ## 1521 29 10 3
## 89 TStyle## 1 715 9 105
## 34 Business#Business Day#Dealbook 506 45 9 22
## 45 Business#Technology# 702 44 9 19
## 48 Business#Technology# 705 42 9 14
## 16 ## 1516 31 9 9
## 59 Culture#Arts# 409 26 8 23
## 9 ## 1509 50 8 16
## 38 Business#Business Day#Dealbook 510 38 8 7
## 82 Science#Health# 104 7 8 7
## 30 Business#Business Day#Dealbook 502 139 7 39
## 31 Business#Business Day#Dealbook 503 128 7 38
## 26 #Opinion#Room For Debate 1 69 7 24
## 47 Business#Technology# 704 44 7 19
## 2 ## 1502 88 7 17
## 5 ## 1505 77 7 16
## 12 ## 1512 44 7 16
## 22 ## 1522 21 7 6
## 6 ## 1506 69 6 23
## 1 ## 1501 103 6 16
## 58 Culture#Arts# 408 42 6 13
## 17 ## 1517 30 6 11
## 87 Styles#U.S.# 1804 12 6 4
## 32 Business#Business Day#Dealbook 504 110 5 50
## 42 Business#Business Day#Small Business 1 135 5 42
## 4 ## 1504 74 4 28
## 54 Culture#Arts# 404 50 4 25
## 55 Culture#Arts# 405 54 4 21
## 33 Business#Business Day#Dealbook 505 70 4 18
## 36 Business#Business Day#Dealbook 508 37 4 16
## 35 Business#Business Day#Dealbook 507 48 4 14
## 10 ## 1510 54 4 11
## 19 ## 1519 33 4 10
## 67 Foreign#World#Asia Pacific 1 200 3 56
## 8 ## 1508 48 3 26
## 56 Culture#Arts# 406 52 3 22
## 7 ## 1507 68 3 21
## 13 ## 1513 38 3 21
## 37 Business#Business Day#Dealbook 509 34 3 17
## 40 Business#Business Day#Dealbook 512 35 3 11
## 41 Business#Business Day#Dealbook 513 27 3 10
## 25 #Multimedia# 1 139 2 52
## 51 Culture#Arts# 401 88 2 25
## 39 Business#Business Day#Dealbook 511 36 2 13
## 50 Business#Technology# 707 25 2 10
## 64 Culture#Arts# 414 16 2 9
## 88 Travel#Travel# 1 116 1 35
## 15 ## 1515 32 1 16
## 46 Business#Technology# 703 54 1 16
## 83 Styles##Fashion 1 118 1 15
## 18 ## 1518 40 1 6
## 24 ## 1524 13 1 6
## 28 #U.S.#Education 1 325 0 90
## 66 Foreign#World# 1 172 0 47
## 11 ## 1511 47 0 21
## 52 Culture#Arts# 402 74 0 20
## 61 Culture#Arts# 411 23 0 20
## 57 Culture#Arts# 407 56 0 15
## 60 Culture#Arts# 410 32 0 13
## 14 ## 1514 41 0 10
## 20 ## 1520 36 0 7
## 63 Culture#Arts# 413 23 0 6
## 65 Culture#Arts# 415 17 0 5
## 62 Culture#Arts# 412 27 0 4
## 23 ## 1523 18 0 3
## 69 myOther 1 38 0 3
## , , = N
##
##
## 1 101 102 103 104 401 402 403 404
## ## 0 0 0 0 0 0 0 0 0
## #Multimedia# 139 0 0 0 0 0 0 0 0
## #Opinion#Room For Debate 69 0 0 0 0 0 0 0 0
## #Opinion#The Public Editor 4 0 0 0 0 0 0 0 0
## #U.S.#Education 325 0 0 0 0 0 0 0 0
## Business#Business Day#Dealbook 0 0 0 0 0 0 0 0 0
## Business#Business Day#Small Business 135 0 0 0 0 0 0 0 0
## Business#Crosswords/Games# 20 0 0 0 0 0 0 0 0
## Business#Technology# 0 0 0 0 0 0 0 0 0
## Culture#Arts# 0 0 0 0 0 88 74 46 50
## Foreign#World# 172 0 0 0 0 0 0 0 0
## Foreign#World#Asia Pacific 200 0 0 0 0 0 0 0 0
## Metro#N.Y. / Region# 181 0 0 0 0 0 0 0 0
## myOther 38 0 0 0 0 0 0 0 0
## OpEd#Opinion# 0 0 0 0 0 0 0 0 0
## Science#Health# 0 36 21 10 7 0 0 0 0
## Styles##Fashion 118 0 0 0 0 0 0 0 0
## Styles#U.S.# 0 0 0 0 0 0 0 0 0
## Travel#Travel# 116 0 0 0 0 0 0 0 0
## TStyle## 715 0 0 0 0 0 0 0 0
##
## 405 406 407 408 409 410 411 412 413
## ## 0 0 0 0 0 0 0 0 0
## #Multimedia# 0 0 0 0 0 0 0 0 0
## #Opinion#Room For Debate 0 0 0 0 0 0 0 0 0
## #Opinion#The Public Editor 0 0 0 0 0 0 0 0 0
## #U.S.#Education 0 0 0 0 0 0 0 0 0
## Business#Business Day#Dealbook 0 0 0 0 0 0 0 0 0
## Business#Business Day#Small Business 0 0 0 0 0 0 0 0 0
## Business#Crosswords/Games# 0 0 0 0 0 0 0 0 0
## Business#Technology# 0 0 0 0 0 0 0 0 0
## Culture#Arts# 54 52 56 42 26 32 23 27 23
## Foreign#World# 0 0 0 0 0 0 0 0 0
## Foreign#World#Asia Pacific 0 0 0 0 0 0 0 0 0
## Metro#N.Y. / Region# 0 0 0 0 0 0 0 0 0
## myOther 0 0 0 0 0 0 0 0 0
## OpEd#Opinion# 0 0 0 0 0 0 0 0 0
## Science#Health# 0 0 0 0 0 0 0 0 0
## Styles##Fashion 0 0 0 0 0 0 0 0 0
## Styles#U.S.# 0 0 0 0 0 0 0 0 0
## Travel#Travel# 0 0 0 0 0 0 0 0 0
## TStyle## 0 0 0 0 0 0 0 0 0
##
## 414 415 501 502 503 504 505 506 507
## ## 0 0 0 0 0 0 0 0 0
## #Multimedia# 0 0 0 0 0 0 0 0 0
## #Opinion#Room For Debate 0 0 0 0 0 0 0 0 0
## #Opinion#The Public Editor 0 0 0 0 0 0 0 0 0
## #U.S.#Education 0 0 0 0 0 0 0 0 0
## Business#Business Day#Dealbook 0 0 117 139 128 110 70 45 48
## Business#Business Day#Small Business 0 0 0 0 0 0 0 0 0
## Business#Crosswords/Games# 0 0 0 0 0 0 0 0 0
## Business#Technology# 0 0 0 0 0 0 0 0 0
## Culture#Arts# 16 17 0 0 0 0 0 0 0
## Foreign#World# 0 0 0 0 0 0 0 0 0
## Foreign#World#Asia Pacific 0 0 0 0 0 0 0 0 0
## Metro#N.Y. / Region# 0 0 0 0 0 0 0 0 0
## myOther 0 0 0 0 0 0 0 0 0
## OpEd#Opinion# 0 0 0 0 0 0 0 0 0
## Science#Health# 0 0 0 0 0 0 0 0 0
## Styles##Fashion 0 0 0 0 0 0 0 0 0
## Styles#U.S.# 0 0 0 0 0 0 0 0 0
## Travel#Travel# 0 0 0 0 0 0 0 0 0
## TStyle## 0 0 0 0 0 0 0 0 0
##
## 508 509 510 511 512 513 701 702 703
## ## 0 0 0 0 0 0 0 0 0
## #Multimedia# 0 0 0 0 0 0 0 0 0
## #Opinion#Room For Debate 0 0 0 0 0 0 0 0 0
## #Opinion#The Public Editor 0 0 0 0 0 0 0 0 0
## #U.S.#Education 0 0 0 0 0 0 0 0 0
## Business#Business Day#Dealbook 37 34 38 36 35 27 0 0 0
## Business#Business Day#Small Business 0 0 0 0 0 0 0 0 0
## Business#Crosswords/Games# 0 0 0 0 0 0 0 0 0
## Business#Technology# 0 0 0 0 0 0 50 44 54
## Culture#Arts# 0 0 0 0 0 0 0 0 0
## Foreign#World# 0 0 0 0 0 0 0 0 0
## Foreign#World#Asia Pacific 0 0 0 0 0 0 0 0 0
## Metro#N.Y. / Region# 0 0 0 0 0 0 0 0 0
## myOther 0 0 0 0 0 0 0 0 0
## OpEd#Opinion# 0 0 0 0 0 0 0 0 0
## Science#Health# 0 0 0 0 0 0 0 0 0
## Styles##Fashion 0 0 0 0 0 0 0 0 0
## Styles#U.S.# 0 0 0 0 0 0 0 0 0
## Travel#Travel# 0 0 0 0 0 0 0 0 0
## TStyle## 0 0 0 0 0 0 0 0 0
##
## 704 705 706 707 1101 1102 1103 1104
## ## 0 0 0 0 0 0 0 0
## #Multimedia# 0 0 0 0 0 0 0 0
## #Opinion#Room For Debate 0 0 0 0 0 0 0 0
## #Opinion#The Public Editor 0 0 0 0 0 0 0 0
## #U.S.#Education 0 0 0 0 0 0 0 0
## Business#Business Day#Dealbook 0 0 0 0 0 0 0 0
## Business#Business Day#Small Business 0 0 0 0 0 0 0 0
## Business#Crosswords/Games# 0 0 0 0 0 0 0 0
## Business#Technology# 44 42 29 25 0 0 0 0
## Culture#Arts# 0 0 0 0 0 0 0 0
## Foreign#World# 0 0 0 0 0 0 0 0
## Foreign#World#Asia Pacific 0 0 0 0 0 0 0 0
## Metro#N.Y. / Region# 0 0 0 0 0 0 0 0
## myOther 0 0 0 0 0 0 0 0
## OpEd#Opinion# 0 0 0 0 28 15 10 13
## Science#Health# 0 0 0 0 0 0 0 0
## Styles##Fashion 0 0 0 0 0 0 0 0
## Styles#U.S.# 0 0 0 0 0 0 0 0
## Travel#Travel# 0 0 0 0 0 0 0 0
## TStyle## 0 0 0 0 0 0 0 0
##
## 1105 1106 1107 1108 1109 1501 1502
## ## 0 0 0 0 0 103 88
## #Multimedia# 0 0 0 0 0 0 0
## #Opinion#Room For Debate 0 0 0 0 0 0 0
## #Opinion#The Public Editor 0 0 0 0 0 0 0
## #U.S.#Education 0 0 0 0 0 0 0
## Business#Business Day#Dealbook 0 0 0 0 0 0 0
## Business#Business Day#Small Business 0 0 0 0 0 0 0
## Business#Crosswords/Games# 0 0 0 0 0 0 0
## Business#Technology# 0 0 0 0 0 0 0
## Culture#Arts# 0 0 0 0 0 0 0
## Foreign#World# 0 0 0 0 0 0 0
## Foreign#World#Asia Pacific 0 0 0 0 0 0 0
## Metro#N.Y. / Region# 0 0 0 0 0 0 0
## myOther 0 0 0 0 0 0 0
## OpEd#Opinion# 14 8 16 10 1 0 0
## Science#Health# 0 0 0 0 0 0 0
## Styles##Fashion 0 0 0 0 0 0 0
## Styles#U.S.# 0 0 0 0 0 0 0
## Travel#Travel# 0 0 0 0 0 0 0
## TStyle## 0 0 0 0 0 0 0
##
## 1503 1504 1505 1506 1507 1508 1509
## ## 79 74 77 69 68 48 50
## #Multimedia# 0 0 0 0 0 0 0
## #Opinion#Room For Debate 0 0 0 0 0 0 0
## #Opinion#The Public Editor 0 0 0 0 0 0 0
## #U.S.#Education 0 0 0 0 0 0 0
## Business#Business Day#Dealbook 0 0 0 0 0 0 0
## Business#Business Day#Small Business 0 0 0 0 0 0 0
## Business#Crosswords/Games# 0 0 0 0 0 0 0
## Business#Technology# 0 0 0 0 0 0 0
## Culture#Arts# 0 0 0 0 0 0 0
## Foreign#World# 0 0 0 0 0 0 0
## Foreign#World#Asia Pacific 0 0 0 0 0 0 0
## Metro#N.Y. / Region# 0 0 0 0 0 0 0
## myOther 0 0 0 0 0 0 0
## OpEd#Opinion# 0 0 0 0 0 0 0
## Science#Health# 0 0 0 0 0 0 0
## Styles##Fashion 0 0 0 0 0 0 0
## Styles#U.S.# 0 0 0 0 0 0 0
## Travel#Travel# 0 0 0 0 0 0 0
## TStyle## 0 0 0 0 0 0 0
##
## 1510 1511 1512 1513 1514 1515 1516
## ## 54 47 44 38 41 32 31
## #Multimedia# 0 0 0 0 0 0 0
## #Opinion#Room For Debate 0 0 0 0 0 0 0
## #Opinion#The Public Editor 0 0 0 0 0 0 0
## #U.S.#Education 0 0 0 0 0 0 0
## Business#Business Day#Dealbook 0 0 0 0 0 0 0
## Business#Business Day#Small Business 0 0 0 0 0 0 0
## Business#Crosswords/Games# 0 0 0 0 0 0 0
## Business#Technology# 0 0 0 0 0 0 0
## Culture#Arts# 0 0 0 0 0 0 0
## Foreign#World# 0 0 0 0 0 0 0
## Foreign#World#Asia Pacific 0 0 0 0 0 0 0
## Metro#N.Y. / Region# 0 0 0 0 0 0 0
## myOther 0 0 0 0 0 0 0
## OpEd#Opinion# 0 0 0 0 0 0 0
## Science#Health# 0 0 0 0 0 0 0
## Styles##Fashion 0 0 0 0 0 0 0
## Styles#U.S.# 0 0 0 0 0 0 0
## Travel#Travel# 0 0 0 0 0 0 0
## TStyle## 0 0 0 0 0 0 0
##
## 1517 1518 1519 1520 1521 1522 1523
## ## 30 40 33 36 29 21 18
## #Multimedia# 0 0 0 0 0 0 0
## #Opinion#Room For Debate 0 0 0 0 0 0 0
## #Opinion#The Public Editor 0 0 0 0 0 0 0
## #U.S.#Education 0 0 0 0 0 0 0
## Business#Business Day#Dealbook 0 0 0 0 0 0 0
## Business#Business Day#Small Business 0 0 0 0 0 0 0
## Business#Crosswords/Games# 0 0 0 0 0 0 0
## Business#Technology# 0 0 0 0 0 0 0
## Culture#Arts# 0 0 0 0 0 0 0
## Foreign#World# 0 0 0 0 0 0 0
## Foreign#World#Asia Pacific 0 0 0 0 0 0 0
## Metro#N.Y. / Region# 0 0 0 0 0 0 0
## myOther 0 0 0 0 0 0 0
## OpEd#Opinion# 0 0 0 0 0 0 0
## Science#Health# 0 0 0 0 0 0 0
## Styles##Fashion 0 0 0 0 0 0 0
## Styles#U.S.# 0 0 0 0 0 0 0
## Travel#Travel# 0 0 0 0 0 0 0
## TStyle## 0 0 0 0 0 0 0
##
## 1524 1801 1802 1803 1804
## ## 13 0 0 0 0
## #Multimedia# 0 0 0 0 0
## #Opinion#Room For Debate 0 0 0 0 0
## #Opinion#The Public Editor 0 0 0 0 0
## #U.S.#Education 0 0 0 0 0
## Business#Business Day#Dealbook 0 0 0 0 0
## Business#Business Day#Small Business 0 0 0 0 0
## Business#Crosswords/Games# 0 0 0 0 0
## Business#Technology# 0 0 0 0 0
## Culture#Arts# 0 0 0 0 0
## Foreign#World# 0 0 0 0 0
## Foreign#World#Asia Pacific 0 0 0 0 0
## Metro#N.Y. / Region# 0 0 0 0 0
## myOther 0 0 0 0 0
## OpEd#Opinion# 0 0 0 0 0
## Science#Health# 0 0 0 0 0
## Styles##Fashion 0 0 0 0 0
## Styles#U.S.# 0 35 22 8 12
## Travel#Travel# 0 0 0 0 0
## TStyle## 0 0 0 0 0
##
## , , = Y
##
##
## 1 101 102 103 104 401 402 403 404
## ## 0 0 0 0 0 0 0 0 0
## #Multimedia# 2 0 0 0 0 0 0 0 0
## #Opinion#Room For Debate 7 0 0 0 0 0 0 0 0
## #Opinion#The Public Editor 16 0 0 0 0 0 0 0 0
## #U.S.#Education 0 0 0 0 0 0 0 0 0
## Business#Business Day#Dealbook 0 0 0 0 0 0 0 0 0
## Business#Business Day#Small Business 5 0 0 0 0 0 0 0 0
## Business#Crosswords/Games# 103 0 0 0 0 0 0 0 0
## Business#Technology# 0 0 0 0 0 0 0 0 0
## Culture#Arts# 0 0 0 0 0 2 0 21 4
## Foreign#World# 0 0 0 0 0 0 0 0 0
## Foreign#World#Asia Pacific 3 0 0 0 0 0 0 0 0
## Metro#N.Y. / Region# 17 0 0 0 0 0 0 0 0
## myOther 0 0 0 0 0 0 0 0 0
## OpEd#Opinion# 0 0 0 0 0 0 0 0 0
## Science#Health# 0 75 28 11 8 0 0 0 0
## Styles##Fashion 1 0 0 0 0 0 0 0 0
## Styles#U.S.# 0 0 0 0 0 0 0 0 0
## Travel#Travel# 1 0 0 0 0 0 0 0 0
## TStyle## 9 0 0 0 0 0 0 0 0
##
## 405 406 407 408 409 410 411 412 413
## ## 0 0 0 0 0 0 0 0 0
## #Multimedia# 0 0 0 0 0 0 0 0 0
## #Opinion#Room For Debate 0 0 0 0 0 0 0 0 0
## #Opinion#The Public Editor 0 0 0 0 0 0 0 0 0
## #U.S.#Education 0 0 0 0 0 0 0 0 0
## Business#Business Day#Dealbook 0 0 0 0 0 0 0 0 0
## Business#Business Day#Small Business 0 0 0 0 0 0 0 0 0
## Business#Crosswords/Games# 0 0 0 0 0 0 0 0 0
## Business#Technology# 0 0 0 0 0 0 0 0 0
## Culture#Arts# 4 3 0 6 8 0 0 0 0
## Foreign#World# 0 0 0 0 0 0 0 0 0
## Foreign#World#Asia Pacific 0 0 0 0 0 0 0 0 0
## Metro#N.Y. / Region# 0 0 0 0 0 0 0 0 0
## myOther 0 0 0 0 0 0 0 0 0
## OpEd#Opinion# 0 0 0 0 0 0 0 0 0
## Science#Health# 0 0 0 0 0 0 0 0 0
## Styles##Fashion 0 0 0 0 0 0 0 0 0
## Styles#U.S.# 0 0 0 0 0 0 0 0 0
## Travel#Travel# 0 0 0 0 0 0 0 0 0
## TStyle## 0 0 0 0 0 0 0 0 0
##
## 414 415 501 502 503 504 505 506 507
## ## 0 0 0 0 0 0 0 0 0
## #Multimedia# 0 0 0 0 0 0 0 0 0
## #Opinion#Room For Debate 0 0 0 0 0 0 0 0 0
## #Opinion#The Public Editor 0 0 0 0 0 0 0 0 0
## #U.S.#Education 0 0 0 0 0 0 0 0 0
## Business#Business Day#Dealbook 0 0 29 7 7 5 4 9 4
## Business#Business Day#Small Business 0 0 0 0 0 0 0 0 0
## Business#Crosswords/Games# 0 0 0 0 0 0 0 0 0
## Business#Technology# 0 0 0 0 0 0 0 0 0
## Culture#Arts# 2 0 0 0 0 0 0 0 0
## Foreign#World# 0 0 0 0 0 0 0 0 0
## Foreign#World#Asia Pacific 0 0 0 0 0 0 0 0 0
## Metro#N.Y. / Region# 0 0 0 0 0 0 0 0 0
## myOther 0 0 0 0 0 0 0 0 0
## OpEd#Opinion# 0 0 0 0 0 0 0 0 0
## Science#Health# 0 0 0 0 0 0 0 0 0
## Styles##Fashion 0 0 0 0 0 0 0 0 0
## Styles#U.S.# 0 0 0 0 0 0 0 0 0
## Travel#Travel# 0 0 0 0 0 0 0 0 0
## TStyle## 0 0 0 0 0 0 0 0 0
##
## 508 509 510 511 512 513 701 702 703
## ## 0 0 0 0 0 0 0 0 0
## #Multimedia# 0 0 0 0 0 0 0 0 0
## #Opinion#Room For Debate 0 0 0 0 0 0 0 0 0
## #Opinion#The Public Editor 0 0 0 0 0 0 0 0 0
## #U.S.#Education 0 0 0 0 0 0 0 0 0
## Business#Business Day#Dealbook 4 3 8 2 3 3 0 0 0
## Business#Business Day#Small Business 0 0 0 0 0 0 0 0 0
## Business#Crosswords/Games# 0 0 0 0 0 0 0 0 0
## Business#Technology# 0 0 0 0 0 0 10 9 1
## Culture#Arts# 0 0 0 0 0 0 0 0 0
## Foreign#World# 0 0 0 0 0 0 0 0 0
## Foreign#World#Asia Pacific 0 0 0 0 0 0 0 0 0
## Metro#N.Y. / Region# 0 0 0 0 0 0 0 0 0
## myOther 0 0 0 0 0 0 0 0 0
## OpEd#Opinion# 0 0 0 0 0 0 0 0 0
## Science#Health# 0 0 0 0 0 0 0 0 0
## Styles##Fashion 0 0 0 0 0 0 0 0 0
## Styles#U.S.# 0 0 0 0 0 0 0 0 0
## Travel#Travel# 0 0 0 0 0 0 0 0 0
## TStyle## 0 0 0 0 0 0 0 0 0
##
## 704 705 706 707 1101 1102 1103 1104
## ## 0 0 0 0 0 0 0 0
## #Multimedia# 0 0 0 0 0 0 0 0
## #Opinion#Room For Debate 0 0 0 0 0 0 0 0
## #Opinion#The Public Editor 0 0 0 0 0 0 0 0
## #U.S.#Education 0 0 0 0 0 0 0 0
## Business#Business Day#Dealbook 0 0 0 0 0 0 0 0
## Business#Business Day#Small Business 0 0 0 0 0 0 0 0
## Business#Crosswords/Games# 0 0 0 0 0 0 0 0
## Business#Technology# 7 9 13 2 0 0 0 0
## Culture#Arts# 0 0 0 0 0 0 0 0
## Foreign#World# 0 0 0 0 0 0 0 0
## Foreign#World#Asia Pacific 0 0 0 0 0 0 0 0
## Metro#N.Y. / Region# 0 0 0 0 0 0 0 0
## myOther 0 0 0 0 0 0 0 0
## OpEd#Opinion# 0 0 0 0 90 74 56 49
## Science#Health# 0 0 0 0 0 0 0 0
## Styles##Fashion 0 0 0 0 0 0 0 0
## Styles#U.S.# 0 0 0 0 0 0 0 0
## Travel#Travel# 0 0 0 0 0 0 0 0
## TStyle## 0 0 0 0 0 0 0 0
##
## 1105 1106 1107 1108 1109 1501 1502
## ## 0 0 0 0 0 6 7
## #Multimedia# 0 0 0 0 0 0 0
## #Opinion#Room For Debate 0 0 0 0 0 0 0
## #Opinion#The Public Editor 0 0 0 0 0 0 0
## #U.S.#Education 0 0 0 0 0 0 0
## Business#Business Day#Dealbook 0 0 0 0 0 0 0
## Business#Business Day#Small Business 0 0 0 0 0 0 0
## Business#Crosswords/Games# 0 0 0 0 0 0 0
## Business#Technology# 0 0 0 0 0 0 0
## Culture#Arts# 0 0 0 0 0 0 0
## Foreign#World# 0 0 0 0 0 0 0
## Foreign#World#Asia Pacific 0 0 0 0 0 0 0
## Metro#N.Y. / Region# 0 0 0 0 0 0 0
## myOther 0 0 0 0 0 0 0
## OpEd#Opinion# 38 34 21 28 18 0 0
## Science#Health# 0 0 0 0 0 0 0
## Styles##Fashion 0 0 0 0 0 0 0
## Styles#U.S.# 0 0 0 0 0 0 0
## Travel#Travel# 0 0 0 0 0 0 0
## TStyle## 0 0 0 0 0 0 0
##
## 1503 1504 1505 1506 1507 1508 1509
## ## 13 4 7 6 3 3 8
## #Multimedia# 0 0 0 0 0 0 0
## #Opinion#Room For Debate 0 0 0 0 0 0 0
## #Opinion#The Public Editor 0 0 0 0 0 0 0
## #U.S.#Education 0 0 0 0 0 0 0
## Business#Business Day#Dealbook 0 0 0 0 0 0 0
## Business#Business Day#Small Business 0 0 0 0 0 0 0
## Business#Crosswords/Games# 0 0 0 0 0 0 0
## Business#Technology# 0 0 0 0 0 0 0
## Culture#Arts# 0 0 0 0 0 0 0
## Foreign#World# 0 0 0 0 0 0 0
## Foreign#World#Asia Pacific 0 0 0 0 0 0 0
## Metro#N.Y. / Region# 0 0 0 0 0 0 0
## myOther 0 0 0 0 0 0 0
## OpEd#Opinion# 0 0 0 0 0 0 0
## Science#Health# 0 0 0 0 0 0 0
## Styles##Fashion 0 0 0 0 0 0 0
## Styles#U.S.# 0 0 0 0 0 0 0
## Travel#Travel# 0 0 0 0 0 0 0
## TStyle## 0 0 0 0 0 0 0
##
## 1510 1511 1512 1513 1514 1515 1516
## ## 4 0 7 3 0 1 9
## #Multimedia# 0 0 0 0 0 0 0
## #Opinion#Room For Debate 0 0 0 0 0 0 0
## #Opinion#The Public Editor 0 0 0 0 0 0 0
## #U.S.#Education 0 0 0 0 0 0 0
## Business#Business Day#Dealbook 0 0 0 0 0 0 0
## Business#Business Day#Small Business 0 0 0 0 0 0 0
## Business#Crosswords/Games# 0 0 0 0 0 0 0
## Business#Technology# 0 0 0 0 0 0 0
## Culture#Arts# 0 0 0 0 0 0 0
## Foreign#World# 0 0 0 0 0 0 0
## Foreign#World#Asia Pacific 0 0 0 0 0 0 0
## Metro#N.Y. / Region# 0 0 0 0 0 0 0
## myOther 0 0 0 0 0 0 0
## OpEd#Opinion# 0 0 0 0 0 0 0
## Science#Health# 0 0 0 0 0 0 0
## Styles##Fashion 0 0 0 0 0 0 0
## Styles#U.S.# 0 0 0 0 0 0 0
## Travel#Travel# 0 0 0 0 0 0 0
## TStyle## 0 0 0 0 0 0 0
##
## 1517 1518 1519 1520 1521 1522 1523
## ## 6 1 4 0 10 7 0
## #Multimedia# 0 0 0 0 0 0 0
## #Opinion#Room For Debate 0 0 0 0 0 0 0
## #Opinion#The Public Editor 0 0 0 0 0 0 0
## #U.S.#Education 0 0 0 0 0 0 0
## Business#Business Day#Dealbook 0 0 0 0 0 0 0
## Business#Business Day#Small Business 0 0 0 0 0 0 0
## Business#Crosswords/Games# 0 0 0 0 0 0 0
## Business#Technology# 0 0 0 0 0 0 0
## Culture#Arts# 0 0 0 0 0 0 0
## Foreign#World# 0 0 0 0 0 0 0
## Foreign#World#Asia Pacific 0 0 0 0 0 0 0
## Metro#N.Y. / Region# 0 0 0 0 0 0 0
## myOther 0 0 0 0 0 0 0
## OpEd#Opinion# 0 0 0 0 0 0 0
## Science#Health# 0 0 0 0 0 0 0
## Styles##Fashion 0 0 0 0 0 0 0
## Styles#U.S.# 0 0 0 0 0 0 0
## Travel#Travel# 0 0 0 0 0 0 0
## TStyle## 0 0 0 0 0 0 0
##
## 1524 1801 1802 1803 1804
## ## 1 0 0 0 0
## #Multimedia# 0 0 0 0 0
## #Opinion#Room For Debate 0 0 0 0 0
## #Opinion#The Public Editor 0 0 0 0 0
## #U.S.#Education 0 0 0 0 0
## Business#Business Day#Dealbook 0 0 0 0 0
## Business#Business Day#Small Business 0 0 0 0 0
## Business#Crosswords/Games# 0 0 0 0 0
## Business#Technology# 0 0 0 0 0
## Culture#Arts# 0 0 0 0 0
## Foreign#World# 0 0 0 0 0
## Foreign#World#Asia Pacific 0 0 0 0 0
## Metro#N.Y. / Region# 0 0 0 0 0
## myOther 0 0 0 0 0
## OpEd#Opinion# 0 0 0 0 0
## Science#Health# 0 0 0 0 0
## Styles##Fashion 0 0 0 0 0
## Styles#U.S.# 0 44 32 18 6
## Travel#Travel# 0 0 0 0 0
## TStyle## 0 0 0 0 0
##
## , , = NA
##
##
## 1 101 102 103 104 401 402 403 404
## ## 0 0 0 0 0 0 0 0 0
## #Multimedia# 52 0 0 0 0 0 0 0 0
## #Opinion#Room For Debate 24 0 0 0 0 0 0 0 0
## #Opinion#The Public Editor 10 0 0 0 0 0 0 0 0
## #U.S.#Education 90 0 0 0 0 0 0 0 0
## Business#Business Day#Dealbook 0 0 0 0 0 0 0 0 0
## Business#Business Day#Small Business 42 0 0 0 0 0 0 0 0
## Business#Crosswords/Games# 42 0 0 0 0 0 0 0 0
## Business#Technology# 0 0 0 0 0 0 0 0 0
## Culture#Arts# 0 0 0 0 0 25 20 23 25
## Foreign#World# 47 0 0 0 0 0 0 0 0
## Foreign#World#Asia Pacific 56 0 0 0 0 0 0 0 0
## Metro#N.Y. / Region# 67 0 0 0 0 0 0 0 0
## myOther 3 0 0 0 0 0 0 0 0
## OpEd#Opinion# 0 0 0 0 0 0 0 0 0
## Science#Health# 0 29 15 6 7 0 0 0 0
## Styles##Fashion 15 0 0 0 0 0 0 0 0
## Styles#U.S.# 0 0 0 0 0 0 0 0 0
## Travel#Travel# 35 0 0 0 0 0 0 0 0
## TStyle## 105 0 0 0 0 0 0 0 0
##
## 405 406 407 408 409 410 411 412 413
## ## 0 0 0 0 0 0 0 0 0
## #Multimedia# 0 0 0 0 0 0 0 0 0
## #Opinion#Room For Debate 0 0 0 0 0 0 0 0 0
## #Opinion#The Public Editor 0 0 0 0 0 0 0 0 0
## #U.S.#Education 0 0 0 0 0 0 0 0 0
## Business#Business Day#Dealbook 0 0 0 0 0 0 0 0 0
## Business#Business Day#Small Business 0 0 0 0 0 0 0 0 0
## Business#Crosswords/Games# 0 0 0 0 0 0 0 0 0
## Business#Technology# 0 0 0 0 0 0 0 0 0
## Culture#Arts# 21 22 15 13 23 13 20 4 6
## Foreign#World# 0 0 0 0 0 0 0 0 0
## Foreign#World#Asia Pacific 0 0 0 0 0 0 0 0 0
## Metro#N.Y. / Region# 0 0 0 0 0 0 0 0 0
## myOther 0 0 0 0 0 0 0 0 0
## OpEd#Opinion# 0 0 0 0 0 0 0 0 0
## Science#Health# 0 0 0 0 0 0 0 0 0
## Styles##Fashion 0 0 0 0 0 0 0 0 0
## Styles#U.S.# 0 0 0 0 0 0 0 0 0
## Travel#Travel# 0 0 0 0 0 0 0 0 0
## TStyle## 0 0 0 0 0 0 0 0 0
##
## 414 415 501 502 503 504 505 506 507
## ## 0 0 0 0 0 0 0 0 0
## #Multimedia# 0 0 0 0 0 0 0 0 0
## #Opinion#Room For Debate 0 0 0 0 0 0 0 0 0
## #Opinion#The Public Editor 0 0 0 0 0 0 0 0 0
## #U.S.#Education 0 0 0 0 0 0 0 0 0
## Business#Business Day#Dealbook 0 0 49 39 38 50 18 22 14
## Business#Business Day#Small Business 0 0 0 0 0 0 0 0 0
## Business#Crosswords/Games# 0 0 0 0 0 0 0 0 0
## Business#Technology# 0 0 0 0 0 0 0 0 0
## Culture#Arts# 9 5 0 0 0 0 0 0 0
## Foreign#World# 0 0 0 0 0 0 0 0 0
## Foreign#World#Asia Pacific 0 0 0 0 0 0 0 0 0
## Metro#N.Y. / Region# 0 0 0 0 0 0 0 0 0
## myOther 0 0 0 0 0 0 0 0 0
## OpEd#Opinion# 0 0 0 0 0 0 0 0 0
## Science#Health# 0 0 0 0 0 0 0 0 0
## Styles##Fashion 0 0 0 0 0 0 0 0 0
## Styles#U.S.# 0 0 0 0 0 0 0 0 0
## Travel#Travel# 0 0 0 0 0 0 0 0 0
## TStyle## 0 0 0 0 0 0 0 0 0
##
## 508 509 510 511 512 513 701 702 703
## ## 0 0 0 0 0 0 0 0 0
## #Multimedia# 0 0 0 0 0 0 0 0 0
## #Opinion#Room For Debate 0 0 0 0 0 0 0 0 0
## #Opinion#The Public Editor 0 0 0 0 0 0 0 0 0
## #U.S.#Education 0 0 0 0 0 0 0 0 0
## Business#Business Day#Dealbook 16 17 7 13 11 10 0 0 0
## Business#Business Day#Small Business 0 0 0 0 0 0 0 0 0
## Business#Crosswords/Games# 0 0 0 0 0 0 0 0 0
## Business#Technology# 0 0 0 0 0 0 25 19 16
## Culture#Arts# 0 0 0 0 0 0 0 0 0
## Foreign#World# 0 0 0 0 0 0 0 0 0
## Foreign#World#Asia Pacific 0 0 0 0 0 0 0 0 0
## Metro#N.Y. / Region# 0 0 0 0 0 0 0 0 0
## myOther 0 0 0 0 0 0 0 0 0
## OpEd#Opinion# 0 0 0 0 0 0 0 0 0
## Science#Health# 0 0 0 0 0 0 0 0 0
## Styles##Fashion 0 0 0 0 0 0 0 0 0
## Styles#U.S.# 0 0 0 0 0 0 0 0 0
## Travel#Travel# 0 0 0 0 0 0 0 0 0
## TStyle## 0 0 0 0 0 0 0 0 0
##
## 704 705 706 707 1101 1102 1103 1104
## ## 0 0 0 0 0 0 0 0
## #Multimedia# 0 0 0 0 0 0 0 0
## #Opinion#Room For Debate 0 0 0 0 0 0 0 0
## #Opinion#The Public Editor 0 0 0 0 0 0 0 0
## #U.S.#Education 0 0 0 0 0 0 0 0
## Business#Business Day#Dealbook 0 0 0 0 0 0 0 0
## Business#Business Day#Small Business 0 0 0 0 0 0 0 0
## Business#Crosswords/Games# 0 0 0 0 0 0 0 0
## Business#Technology# 19 14 10 10 0 0 0 0
## Culture#Arts# 0 0 0 0 0 0 0 0
## Foreign#World# 0 0 0 0 0 0 0 0
## Foreign#World#Asia Pacific 0 0 0 0 0 0 0 0
## Metro#N.Y. / Region# 0 0 0 0 0 0 0 0
## myOther 0 0 0 0 0 0 0 0
## OpEd#Opinion# 0 0 0 0 37 22 18 20
## Science#Health# 0 0 0 0 0 0 0 0
## Styles##Fashion 0 0 0 0 0 0 0 0
## Styles#U.S.# 0 0 0 0 0 0 0 0
## Travel#Travel# 0 0 0 0 0 0 0 0
## TStyle## 0 0 0 0 0 0 0 0
##
## 1105 1106 1107 1108 1109 1501 1502
## ## 0 0 0 0 0 16 17
## #Multimedia# 0 0 0 0 0 0 0
## #Opinion#Room For Debate 0 0 0 0 0 0 0
## #Opinion#The Public Editor 0 0 0 0 0 0 0
## #U.S.#Education 0 0 0 0 0 0 0
## Business#Business Day#Dealbook 0 0 0 0 0 0 0
## Business#Business Day#Small Business 0 0 0 0 0 0 0
## Business#Crosswords/Games# 0 0 0 0 0 0 0
## Business#Technology# 0 0 0 0 0 0 0
## Culture#Arts# 0 0 0 0 0 0 0
## Foreign#World# 0 0 0 0 0 0 0
## Foreign#World#Asia Pacific 0 0 0 0 0 0 0
## Metro#N.Y. / Region# 0 0 0 0 0 0 0
## myOther 0 0 0 0 0 0 0
## OpEd#Opinion# 17 16 17 8 9 0 0
## Science#Health# 0 0 0 0 0 0 0
## Styles##Fashion 0 0 0 0 0 0 0
## Styles#U.S.# 0 0 0 0 0 0 0
## Travel#Travel# 0 0 0 0 0 0 0
## TStyle## 0 0 0 0 0 0 0
##
## 1503 1504 1505 1506 1507 1508 1509
## ## 19 28 16 23 21 26 16
## #Multimedia# 0 0 0 0 0 0 0
## #Opinion#Room For Debate 0 0 0 0 0 0 0
## #Opinion#The Public Editor 0 0 0 0 0 0 0
## #U.S.#Education 0 0 0 0 0 0 0
## Business#Business Day#Dealbook 0 0 0 0 0 0 0
## Business#Business Day#Small Business 0 0 0 0 0 0 0
## Business#Crosswords/Games# 0 0 0 0 0 0 0
## Business#Technology# 0 0 0 0 0 0 0
## Culture#Arts# 0 0 0 0 0 0 0
## Foreign#World# 0 0 0 0 0 0 0
## Foreign#World#Asia Pacific 0 0 0 0 0 0 0
## Metro#N.Y. / Region# 0 0 0 0 0 0 0
## myOther 0 0 0 0 0 0 0
## OpEd#Opinion# 0 0 0 0 0 0 0
## Science#Health# 0 0 0 0 0 0 0
## Styles##Fashion 0 0 0 0 0 0 0
## Styles#U.S.# 0 0 0 0 0 0 0
## Travel#Travel# 0 0 0 0 0 0 0
## TStyle## 0 0 0 0 0 0 0
##
## 1510 1511 1512 1513 1514 1515 1516
## ## 11 21 16 21 10 16 9
## #Multimedia# 0 0 0 0 0 0 0
## #Opinion#Room For Debate 0 0 0 0 0 0 0
## #Opinion#The Public Editor 0 0 0 0 0 0 0
## #U.S.#Education 0 0 0 0 0 0 0
## Business#Business Day#Dealbook 0 0 0 0 0 0 0
## Business#Business Day#Small Business 0 0 0 0 0 0 0
## Business#Crosswords/Games# 0 0 0 0 0 0 0
## Business#Technology# 0 0 0 0 0 0 0
## Culture#Arts# 0 0 0 0 0 0 0
## Foreign#World# 0 0 0 0 0 0 0
## Foreign#World#Asia Pacific 0 0 0 0 0 0 0
## Metro#N.Y. / Region# 0 0 0 0 0 0 0
## myOther 0 0 0 0 0 0 0
## OpEd#Opinion# 0 0 0 0 0 0 0
## Science#Health# 0 0 0 0 0 0 0
## Styles##Fashion 0 0 0 0 0 0 0
## Styles#U.S.# 0 0 0 0 0 0 0
## Travel#Travel# 0 0 0 0 0 0 0
## TStyle## 0 0 0 0 0 0 0
##
## 1517 1518 1519 1520 1521 1522 1523
## ## 11 6 10 7 3 6 3
## #Multimedia# 0 0 0 0 0 0 0
## #Opinion#Room For Debate 0 0 0 0 0 0 0
## #Opinion#The Public Editor 0 0 0 0 0 0 0
## #U.S.#Education 0 0 0 0 0 0 0
## Business#Business Day#Dealbook 0 0 0 0 0 0 0
## Business#Business Day#Small Business 0 0 0 0 0 0 0
## Business#Crosswords/Games# 0 0 0 0 0 0 0
## Business#Technology# 0 0 0 0 0 0 0
## Culture#Arts# 0 0 0 0 0 0 0
## Foreign#World# 0 0 0 0 0 0 0
## Foreign#World#Asia Pacific 0 0 0 0 0 0 0
## Metro#N.Y. / Region# 0 0 0 0 0 0 0
## myOther 0 0 0 0 0 0 0
## OpEd#Opinion# 0 0 0 0 0 0 0
## Science#Health# 0 0 0 0 0 0 0
## Styles##Fashion 0 0 0 0 0 0 0
## Styles#U.S.# 0 0 0 0 0 0 0
## Travel#Travel# 0 0 0 0 0 0 0
## TStyle## 0 0 0 0 0 0 0
##
## 1524 1801 1802 1803 1804
## ## 6 0 0 0 0
## #Multimedia# 0 0 0 0 0
## #Opinion#Room For Debate 0 0 0 0 0
## #Opinion#The Public Editor 0 0 0 0 0
## #U.S.#Education 0 0 0 0 0
## Business#Business Day#Dealbook 0 0 0 0 0
## Business#Business Day#Small Business 0 0 0 0 0
## Business#Crosswords/Games# 0 0 0 0 0
## Business#Technology# 0 0 0 0 0
## Culture#Arts# 0 0 0 0 0
## Foreign#World# 0 0 0 0 0
## Foreign#World#Asia Pacific 0 0 0 0 0
## Metro#N.Y. / Region# 0 0 0 0 0
## myOther 0 0 0 0 0
## OpEd#Opinion# 0 0 0 0 0
## Science#Health# 0 0 0 0 0
## Styles##Fashion 0 0 0 0 0
## Styles#U.S.# 0 21 23 14 4
## Travel#Travel# 0 0 0 0 0
## TStyle## 0 0 0 0 0
##
## [1] UniqueID Popular myCategory Headline .clusterid
## <0 rows> (or 0-length row.names)
# Re-partition: rebuild the training and new-observation frames from the
# combined frame, keyed on the .src tag column ("Train" / "Test").
# which() discards rows whose comparison is NA, matching subset() semantics.
glb_trnobs_df <- glb_allobs_df[which(glb_allobs_df[[".src"]] == "Train"), ,
                               drop = FALSE]
glb_newobs_df <- glb_allobs_df[which(glb_allobs_df[[".src"]] == "Test"), ,
                               drop = FALSE]
# Register the next pipeline chunk; major.inc = TRUE requests a new major step
# (the chunk table printed below shows step_major going from 4 to 5).
glb_chunks_df <- myadd_chunk(glb_chunks_df, "select.features",
                             major.inc = TRUE)
## label step_major step_minor bgn end elapsed
## 7 cluster.data 4 0 213.686 235.859 22.173
## 8 select.features 5 0 235.859 NA NA
# 5.0: select features
# Build the feature table from the training observations. The printed result
# lists, per feature: id, cor.y (correlation with the response), the
# exclude.as.feat flag and cor.y.abs. Features for which cor() reports a zero
# standard deviation show up with cor.y = NA.
glb_feats_df <- myselect_features(
    entity_df = glb_trnobs_df,
    exclude_vars_as_features = glb_exclude_vars_as_features,
    rsp_var = glb_rsp_var)
print(glb_feats_df)
## Warning in cor(data.matrix(entity_df[, sel_feats]), y =
## as.numeric(entity_df[, : the standard deviation is zero
## id cor.y
## Popular Popular 1.000000e+00
## A.nuppr.log A.nuppr.log -2.720962e-01
## S.nuppr.log S.nuppr.log -2.718459e-01
## WordCount.log WordCount.log 2.656836e-01
## A.ratio.sum.TfIdf.nwrds A.ratio.sum.TfIdf.nwrds 2.623865e-01
## S.ratio.sum.TfIdf.nwrds S.ratio.sum.TfIdf.nwrds 2.622549e-01
## WordCount WordCount 2.575265e-01
## S.nwrds.unq.log S.nwrds.unq.log -2.461670e-01
## A.nwrds.unq.log A.nwrds.unq.log -2.460117e-01
## H.ratio.sum.TfIdf.nwrds H.ratio.sum.TfIdf.nwrds 2.254527e-01
## S.nchrs.log S.nchrs.log -2.246930e-01
## A.nchrs.log A.nchrs.log -2.245488e-01
## H.nwrds.unq.log H.nwrds.unq.log -2.014127e-01
## A.nwrds.log A.nwrds.log -1.978712e-01
## S.nwrds.log S.nwrds.log -1.978341e-01
## .clusterid .clusterid 1.820567e-01
## .clusterid.fctr .clusterid.fctr 1.813987e-01
## H.nchrs.log H.nchrs.log -1.710624e-01
## H.nwrds.log H.nwrds.log -1.573431e-01
## H.sum.TfIdf H.sum.TfIdf 1.520414e-01
## S.sum.TfIdf S.sum.TfIdf 1.484963e-01
## A.sum.TfIdf A.sum.TfIdf 1.478461e-01
## PubDate.hour.fctr PubDate.hour.fctr 1.354368e-01
## H.npnct19.log H.npnct19.log 1.283641e-01
## H.nuppr.log H.nuppr.log -1.278085e-01
## A.ndgts.log A.ndgts.log -1.249484e-01
## S.ndgts.log S.ndgts.log -1.242046e-01
## A.ratio.nstopwrds.nwrds A.ratio.nstopwrds.nwrds 1.213545e-01
## S.ratio.nstopwrds.nwrds S.ratio.nstopwrds.nwrds 1.206896e-01
## H.ndgts.log H.ndgts.log -1.196633e-01
## A.nstopwrds.log A.nstopwrds.log -1.153879e-01
## S.nstopwrds.log S.nstopwrds.log -1.148150e-01
## PubDate.wkend PubDate.wkend 1.067288e-01
## A.npnct11.log A.npnct11.log -9.183870e-02
## S.npnct11.log S.npnct11.log -9.158156e-02
## H.P.recap.colon H.P.recap.colon 9.008096e-02
## H.npnct28.log H.npnct28.log -8.917338e-02
## H.P.quandary H.P.quandary 8.734922e-02
## H.nstopwrds.log H.nstopwrds.log -8.657067e-02
## S.T.week S.T.week -8.552704e-02
## A.T.week A.T.week -8.542792e-02
## S.T.fashion S.T.fashion -8.417159e-02
## A.T.fashion A.T.fashion -8.416793e-02
## H.npnct15.log H.npnct15.log -8.273237e-02
## H.T.fashion H.T.fashion -7.947505e-02
## H.P.year.colon H.P.year.colon -7.842875e-02
## H.P.fashion.week H.P.fashion.week -7.632046e-02
## S.T.archiv S.T.archiv -7.202808e-02
## A.T.archiv A.T.archiv -7.202808e-02
## S.P.fashion.week S.P.fashion.week -7.080716e-02
## A.P.fashion.week A.P.fashion.week -7.080716e-02
## S.T.tribun S.T.tribun -7.013418e-02
## A.T.tribun A.T.tribun -7.013418e-02
## S.T.intern S.T.intern -6.956906e-02
## A.T.intern A.T.intern -6.953025e-02
## A.npnct15.log A.npnct15.log -6.893301e-02
## S.T.photo S.T.photo -6.874283e-02
## A.T.photo A.T.photo -6.873838e-02
## H.T.week H.T.week -6.827601e-02
## S.npnct15.log S.npnct15.log -6.770952e-02
## S.T.herald S.T.herald -6.752419e-02
## A.T.herald A.T.herald -6.752419e-02
## H.T.X2015 H.T.X2015 -6.601141e-02
## H.T.daili H.T.daili -6.303731e-02
## S.npnct04.log S.npnct04.log -6.294642e-02
## A.npnct04.log A.npnct04.log -6.294642e-02
## H.T.report H.T.report -6.238114e-02
## S.T.diari S.T.diari -6.229931e-02
## A.T.diari A.T.diari -6.229931e-02
## H.npnct14.log H.npnct14.log -6.158577e-02
## H.P.no.comment.colon H.P.no.comment.colon 6.074669e-02
## H.T.day H.T.day -6.044381e-02
## H.T.springsumm H.T.springsumm -5.943248e-02
## H.T.today H.T.today -5.833786e-02
## H.T.newyork H.T.newyork -5.564999e-02
## S.npnct19.log S.npnct19.log 5.503894e-02
## A.npnct19.log A.npnct19.log 5.482747e-02
## S.T.articl S.T.articl -5.471737e-02
## A.T.articl A.T.articl -5.470831e-02
## H.P.facts.figures H.P.facts.figures 5.410097e-02
## PubDate.last10 PubDate.last10 5.398093e-02
## H.npnct08.log H.npnct08.log 5.375262e-02
## S.npnct13.log S.npnct13.log -5.332519e-02
## H.npnct04.log H.npnct04.log -5.126277e-02
## S.T.share S.T.share -5.105597e-02
## A.T.share A.T.share -5.105597e-02
## A.npnct13.log A.npnct13.log -4.999563e-02
## S.T.newyorktim S.T.newyorktim -4.985328e-02
## A.T.newyorktim A.T.newyorktim -4.984782e-02
## PubDate.last10.log PubDate.last10.log 4.931702e-02
## H.T.busi H.T.busi -4.899819e-02
## H.T.morn H.T.morn -4.838380e-02
## S.T.word S.T.word -4.822452e-02
## A.T.word A.T.word -4.821561e-02
## S.T.compani S.T.compani -4.787994e-02
## S.T.report S.T.report -4.779877e-02
## A.T.compani A.T.compani -4.774812e-02
## A.T.report A.T.report -4.774593e-02
## S.T.newyork S.T.newyork -4.694998e-02
## A.T.newyork A.T.newyork -4.686921e-02
## PubDate.last1.log PubDate.last1.log 4.635751e-02
## H.T.X2014 H.T.X2014 -4.523858e-02
## H.T.first H.T.first -4.472902e-02
## S.T.first S.T.first -4.447317e-02
## H.T.news H.T.news -4.436368e-02
## A.T.first A.T.first -4.433630e-02
## H.P.readers.respond H.P.readers.respond 4.432886e-02
## H.T.clip H.T.clip -4.388279e-02
## H.P.daily.clip.report H.P.daily.clip.report -4.388279e-02
## S.P.daily.clip.report S.P.daily.clip.report -4.388279e-02
## A.P.daily.clip.report A.P.daily.clip.report -4.388279e-02
## A.npnct28.log A.npnct28.log -4.373349e-02
## S.npnct28.log S.npnct28.log -4.370037e-02
## H.P.first.draft H.P.first.draft -4.316253e-02
## A.T.day A.T.day -4.270831e-02
## S.T.day S.T.day -4.262213e-02
## H.P.today.in.smallbusiness H.P.today.in.smallbusiness -4.243051e-02
## A.T.show A.T.show -4.185292e-02
## S.T.show S.T.show -4.182920e-02
## S.T.senat S.T.senat -4.143422e-02
## A.T.senat A.T.senat -4.139980e-02
## A.T.make A.T.make 4.124187e-02
## S.T.make S.T.make 4.118050e-02
## H.T.new H.T.new -4.111696e-02
## H.ratio.nstopwrds.nwrds H.ratio.nstopwrds.nwrds 4.024406e-02
## H.T.pictur H.T.pictur -4.003882e-02
## PubDate.last100 PubDate.last100 3.989229e-02
## PubDate.wkday.fctr PubDate.wkday.fctr -3.980129e-02
## S.T.appear S.T.appear -3.941362e-02
## A.T.appear A.T.appear -3.941362e-02
## S.T.will S.T.will -3.888838e-02
## A.T.will A.T.will -3.884318e-02
## H.P.what.we.are H.P.what.we.are -3.775209e-02
## A.npnct12.log A.npnct12.log -3.760012e-02
## S.T.year S.T.year -3.756011e-02
## A.T.year A.T.year -3.741571e-02
## H.P.today.in.politic H.P.today.in.politic -3.733661e-02
## S.npnct12.log S.npnct12.log -3.638891e-02
## PubDate.last1 PubDate.last1 3.592267e-02
## H.T.read H.T.read -3.467043e-02
## PubDate.minute.fctr PubDate.minute.fctr -3.407385e-02
## H.T.get H.T.get 3.300192e-02
## H.T.art H.T.art -3.291486e-02
## H.T.china H.T.china -3.283653e-02
## H.P.verbatim.colon H.P.verbatim.colon -3.194363e-02
## H.npnct06.log H.npnct06.log 3.190718e-02
## S.npnct01.log S.npnct01.log 3.093101e-02
## A.npnct01.log A.npnct01.log 3.093101e-02
## A.T.can A.T.can 3.083389e-02
## H.T.polit H.T.polit -3.058564e-02
## H.npnct16.log H.npnct16.log 3.039622e-02
## S.T.can S.T.can 3.005998e-02
## S.P.metropolitan.diary.colon S.P.metropolitan.diary.colon -2.841404e-02
## A.P.metropolitan.diary.colon A.P.metropolitan.diary.colon -2.841404e-02
## H.T.billion H.T.billion -2.776561e-02
## S.npnct21.log S.npnct21.log 2.760321e-02
## S.npnct23.log S.npnct23.log 2.760321e-02
## H.T.ebola H.T.ebola 2.682920e-02
## A.T.new A.T.new -2.597887e-02
## S.T.new S.T.new -2.592872e-02
## H.T.deal H.T.deal -2.556237e-02
## H.npnct13.log H.npnct13.log -2.524770e-02
## H.T.newyorktim H.T.newyorktim -2.514415e-02
## A.T.time A.T.time -2.430509e-02
## S.T.time S.T.time -2.416246e-02
## A.npnct14.log A.npnct14.log -2.407715e-02
## S.npnct06.log S.npnct06.log -2.389145e-02
## A.npnct06.log A.npnct06.log -2.389145e-02
## A.T.take A.T.take -2.271897e-02
## H.npnct01.log H.npnct01.log 2.271577e-02
## S.T.take S.T.take -2.264447e-02
## H.P.on.this.day H.P.on.this.day -2.150663e-02
## S.P.first.draft S.P.first.draft -2.150663e-02
## A.P.first.draft A.P.first.draft -2.150663e-02
## S.npnct14.log S.npnct14.log -2.121844e-02
## H.T.test H.T.test -2.117852e-02
## H.npnct02.log H.npnct02.log -2.001851e-02
## S.npnct20.log S.npnct20.log -1.923169e-02
## A.npnct20.log A.npnct20.log -1.923169e-02
## A.T.obama A.T.obama -1.914924e-02
## PubDate.month.fctr PubDate.month.fctr 1.914874e-02
## S.T.obama S.T.obama -1.914281e-02
## A.T.said A.T.said 1.876762e-02
## S.T.said S.T.said 1.863436e-02
## S.P.year.colon S.P.year.colon -1.755336e-02
## A.P.year.colon A.P.year.colon -1.755336e-02
## PubDate.POSIX PubDate.POSIX 1.568326e-02
## PubDate.zoo PubDate.zoo 1.568326e-02
## A.npnct21.log A.npnct21.log 1.537569e-02
## A.npnct23.log A.npnct23.log 1.537569e-02
## A.npnct17.log A.npnct17.log -1.457558e-02
## A.npnct02.log A.npnct02.log -1.451467e-02
## H.T.big H.T.big -1.438162e-02
## H.T.word H.T.word -1.382927e-02
## A.npnct03.log A.npnct03.log -1.359260e-02
## H.T.make H.T.make 1.349595e-02
## H.npnct11.log H.npnct11.log 1.333613e-02
## H.npnct12.log H.npnct12.log -1.305305e-02
## A.P.http A.P.http -1.294748e-02
## A.npnct18.log A.npnct18.log -1.271661e-02
## S.npnct03.log S.npnct03.log -1.240734e-02
## myCategory.fctr myCategory.fctr 1.234541e-02
## S.npnct07.log S.npnct07.log -1.214357e-02
## A.npnct07.log A.npnct07.log -1.214357e-02
## H.npnct07.log H.npnct07.log -1.201741e-02
## PubDate.second.fctr PubDate.second.fctr -1.187946e-02
## UniqueID UniqueID 1.182492e-02
## PubDate.date.fctr PubDate.date.fctr -1.164756e-02
## A.T.one A.T.one 1.051414e-02
## S.T.one S.T.one 1.050293e-02
## H.T.bank H.T.bank -1.037439e-02
## H.T.obama H.T.obama -9.878461e-03
## H.T.say H.T.say -9.763205e-03
## H.P.friday.night.music H.P.friday.night.music -9.653967e-03
## H.npnct05.log H.npnct05.log -9.653967e-03
## H.npnct03.log H.npnct03.log 9.533020e-03
## .rnorm .rnorm -8.244230e-03
## H.P.s.notebook H.P.s.notebook 7.755542e-03
## PubDate.last100.log PubDate.last100.log -7.663322e-03
## H.npnct10.log H.npnct10.log -5.547032e-03
## H.npnct20.log H.npnct20.log -5.547032e-03
## S.npnct02.log S.npnct02.log -5.547032e-03
## S.npnct10.log S.npnct10.log -5.547032e-03
## A.npnct10.log A.npnct10.log -5.547032e-03
## A.npnct25.log A.npnct25.log -5.547032e-03
## A.npnct08.log A.npnct08.log -4.193476e-03
## S.npnct08.log S.npnct08.log -3.372706e-03
## S.T.presid S.T.presid -2.381159e-03
## A.T.presid A.T.presid -2.090565e-03
## S.npnct16.log S.npnct16.log -1.587454e-03
## A.npnct16.log A.npnct16.log -1.587454e-03
## H.T.take H.T.take -1.263270e-03
## H.npnct24.log H.npnct24.log -9.890046e-19
## S.npnct24.log S.npnct24.log -9.890046e-19
## A.npnct24.log A.npnct24.log -9.890046e-19
## H.npnct09.log H.npnct09.log NA
## H.npnct17.log H.npnct17.log NA
## H.npnct18.log H.npnct18.log NA
## H.npnct21.log H.npnct21.log NA
## H.npnct22.log H.npnct22.log NA
## H.npnct23.log H.npnct23.log NA
## H.npnct25.log H.npnct25.log NA
## H.npnct26.log H.npnct26.log NA
## H.npnct27.log H.npnct27.log NA
## H.npnct29.log H.npnct29.log NA
## H.npnct30.log H.npnct30.log NA
## H.P.http H.P.http NA
## S.npnct05.log S.npnct05.log NA
## S.npnct09.log S.npnct09.log NA
## S.npnct17.log S.npnct17.log NA
## S.npnct18.log S.npnct18.log NA
## S.npnct22.log S.npnct22.log NA
## S.npnct25.log S.npnct25.log NA
## S.npnct26.log S.npnct26.log NA
## S.npnct27.log S.npnct27.log NA
## S.npnct29.log S.npnct29.log NA
## S.npnct30.log S.npnct30.log NA
## S.P.http S.P.http NA
## A.npnct05.log A.npnct05.log NA
## A.npnct09.log A.npnct09.log NA
## A.npnct22.log A.npnct22.log NA
## A.npnct26.log A.npnct26.log NA
## A.npnct27.log A.npnct27.log NA
## A.npnct29.log A.npnct29.log NA
## A.npnct30.log A.npnct30.log NA
## PubDate.year.fctr PubDate.year.fctr NA
## exclude.as.feat cor.y.abs
## Popular 1 1.000000e+00
## A.nuppr.log 0 2.720962e-01
## S.nuppr.log 0 2.718459e-01
## WordCount.log 0 2.656836e-01
## A.ratio.sum.TfIdf.nwrds 0 2.623865e-01
## S.ratio.sum.TfIdf.nwrds 0 2.622549e-01
## WordCount 1 2.575265e-01
## S.nwrds.unq.log 0 2.461670e-01
## A.nwrds.unq.log 0 2.460117e-01
## H.ratio.sum.TfIdf.nwrds 0 2.254527e-01
## S.nchrs.log 0 2.246930e-01
## A.nchrs.log 0 2.245488e-01
## H.nwrds.unq.log 0 2.014127e-01
## A.nwrds.log 0 1.978712e-01
## S.nwrds.log 0 1.978341e-01
## .clusterid 1 1.820567e-01
## .clusterid.fctr 0 1.813987e-01
## H.nchrs.log 0 1.710624e-01
## H.nwrds.log 0 1.573431e-01
## H.sum.TfIdf 0 1.520414e-01
## S.sum.TfIdf 0 1.484963e-01
## A.sum.TfIdf 0 1.478461e-01
## PubDate.hour.fctr 0 1.354368e-01
## H.npnct19.log 0 1.283641e-01
## H.nuppr.log 0 1.278085e-01
## A.ndgts.log 0 1.249484e-01
## S.ndgts.log 0 1.242046e-01
## A.ratio.nstopwrds.nwrds 0 1.213545e-01
## S.ratio.nstopwrds.nwrds 0 1.206896e-01
## H.ndgts.log 0 1.196633e-01
## A.nstopwrds.log 0 1.153879e-01
## S.nstopwrds.log 0 1.148150e-01
## PubDate.wkend 0 1.067288e-01
## A.npnct11.log 0 9.183870e-02
## S.npnct11.log 0 9.158156e-02
## H.P.recap.colon 0 9.008096e-02
## H.npnct28.log 0 8.917338e-02
## H.P.quandary 0 8.734922e-02
## H.nstopwrds.log 0 8.657067e-02
## S.T.week 0 8.552704e-02
## A.T.week 0 8.542792e-02
## S.T.fashion 0 8.417159e-02
## A.T.fashion 0 8.416793e-02
## H.npnct15.log 0 8.273237e-02
## H.T.fashion 0 7.947505e-02
## H.P.year.colon 0 7.842875e-02
## H.P.fashion.week 0 7.632046e-02
## S.T.archiv 0 7.202808e-02
## A.T.archiv 0 7.202808e-02
## S.P.fashion.week 0 7.080716e-02
## A.P.fashion.week 0 7.080716e-02
## S.T.tribun 0 7.013418e-02
## A.T.tribun 0 7.013418e-02
## S.T.intern 0 6.956906e-02
## A.T.intern 0 6.953025e-02
## A.npnct15.log 0 6.893301e-02
## S.T.photo 0 6.874283e-02
## A.T.photo 0 6.873838e-02
## H.T.week 0 6.827601e-02
## S.npnct15.log 0 6.770952e-02
## S.T.herald 0 6.752419e-02
## A.T.herald 0 6.752419e-02
## H.T.X2015 0 6.601141e-02
## H.T.daili 0 6.303731e-02
## S.npnct04.log 0 6.294642e-02
## A.npnct04.log 0 6.294642e-02
## H.T.report 0 6.238114e-02
## S.T.diari 0 6.229931e-02
## A.T.diari 0 6.229931e-02
## H.npnct14.log 0 6.158577e-02
## H.P.no.comment.colon 0 6.074669e-02
## H.T.day 0 6.044381e-02
## H.T.springsumm 0 5.943248e-02
## H.T.today 0 5.833786e-02
## H.T.newyork 0 5.564999e-02
## S.npnct19.log 0 5.503894e-02
## A.npnct19.log 0 5.482747e-02
## S.T.articl 0 5.471737e-02
## A.T.articl 0 5.470831e-02
## H.P.facts.figures 0 5.410097e-02
## PubDate.last10 1 5.398093e-02
## H.npnct08.log 0 5.375262e-02
## S.npnct13.log 0 5.332519e-02
## H.npnct04.log 0 5.126277e-02
## S.T.share 0 5.105597e-02
## A.T.share 0 5.105597e-02
## A.npnct13.log 0 4.999563e-02
## S.T.newyorktim 0 4.985328e-02
## A.T.newyorktim 0 4.984782e-02
## PubDate.last10.log 0 4.931702e-02
## H.T.busi 0 4.899819e-02
## H.T.morn 0 4.838380e-02
## S.T.word 0 4.822452e-02
## A.T.word 0 4.821561e-02
## S.T.compani 0 4.787994e-02
## S.T.report 0 4.779877e-02
## A.T.compani 0 4.774812e-02
## A.T.report 0 4.774593e-02
## S.T.newyork 0 4.694998e-02
## A.T.newyork 0 4.686921e-02
## PubDate.last1.log 0 4.635751e-02
## H.T.X2014 0 4.523858e-02
## H.T.first 0 4.472902e-02
## S.T.first 0 4.447317e-02
## H.T.news 0 4.436368e-02
## A.T.first 0 4.433630e-02
## H.P.readers.respond 0 4.432886e-02
## H.T.clip 0 4.388279e-02
## H.P.daily.clip.report 0 4.388279e-02
## S.P.daily.clip.report 0 4.388279e-02
## A.P.daily.clip.report 0 4.388279e-02
## A.npnct28.log 0 4.373349e-02
## S.npnct28.log 0 4.370037e-02
## H.P.first.draft 0 4.316253e-02
## A.T.day 0 4.270831e-02
## S.T.day 0 4.262213e-02
## H.P.today.in.smallbusiness 0 4.243051e-02
## A.T.show 0 4.185292e-02
## S.T.show 0 4.182920e-02
## S.T.senat 0 4.143422e-02
## A.T.senat 0 4.139980e-02
## A.T.make 0 4.124187e-02
## S.T.make 0 4.118050e-02
## H.T.new 0 4.111696e-02
## H.ratio.nstopwrds.nwrds 0 4.024406e-02
## H.T.pictur 0 4.003882e-02
## PubDate.last100 1 3.989229e-02
## PubDate.wkday.fctr 0 3.980129e-02
## S.T.appear 0 3.941362e-02
## A.T.appear 0 3.941362e-02
## S.T.will 0 3.888838e-02
## A.T.will 0 3.884318e-02
## H.P.what.we.are 0 3.775209e-02
## A.npnct12.log 0 3.760012e-02
## S.T.year 0 3.756011e-02
## A.T.year 0 3.741571e-02
## H.P.today.in.politic 0 3.733661e-02
## S.npnct12.log 0 3.638891e-02
## PubDate.last1 1 3.592267e-02
## H.T.read 0 3.467043e-02
## PubDate.minute.fctr 0 3.407385e-02
## H.T.get 0 3.300192e-02
## H.T.art 0 3.291486e-02
## H.T.china 0 3.283653e-02
## H.P.verbatim.colon 0 3.194363e-02
## H.npnct06.log 0 3.190718e-02
## S.npnct01.log 0 3.093101e-02
## A.npnct01.log 0 3.093101e-02
## A.T.can 0 3.083389e-02
## H.T.polit 0 3.058564e-02
## H.npnct16.log 0 3.039622e-02
## S.T.can 0 3.005998e-02
## S.P.metropolitan.diary.colon 0 2.841404e-02
## A.P.metropolitan.diary.colon 0 2.841404e-02
## H.T.billion 0 2.776561e-02
## S.npnct21.log 0 2.760321e-02
## S.npnct23.log 0 2.760321e-02
## H.T.ebola 0 2.682920e-02
## A.T.new 0 2.597887e-02
## S.T.new 0 2.592872e-02
## H.T.deal 0 2.556237e-02
## H.npnct13.log 0 2.524770e-02
## H.T.newyorktim 0 2.514415e-02
## A.T.time 0 2.430509e-02
## S.T.time 0 2.416246e-02
## A.npnct14.log 0 2.407715e-02
## S.npnct06.log 0 2.389145e-02
## A.npnct06.log 0 2.389145e-02
## A.T.take 0 2.271897e-02
## H.npnct01.log 0 2.271577e-02
## S.T.take 0 2.264447e-02
## H.P.on.this.day 0 2.150663e-02
## S.P.first.draft 0 2.150663e-02
## A.P.first.draft 0 2.150663e-02
## S.npnct14.log 0 2.121844e-02
## H.T.test 0 2.117852e-02
## H.npnct02.log 0 2.001851e-02
## S.npnct20.log 0 1.923169e-02
## A.npnct20.log 0 1.923169e-02
## A.T.obama 0 1.914924e-02
## PubDate.month.fctr 1 1.914874e-02
## S.T.obama 0 1.914281e-02
## A.T.said 0 1.876762e-02
## S.T.said 0 1.863436e-02
## S.P.year.colon 0 1.755336e-02
## A.P.year.colon 0 1.755336e-02
## PubDate.POSIX 1 1.568326e-02
## PubDate.zoo 1 1.568326e-02
## A.npnct21.log 0 1.537569e-02
## A.npnct23.log 0 1.537569e-02
## A.npnct17.log 0 1.457558e-02
## A.npnct02.log 0 1.451467e-02
## H.T.big 0 1.438162e-02
## H.T.word 0 1.382927e-02
## A.npnct03.log 0 1.359260e-02
## H.T.make 0 1.349595e-02
## H.npnct11.log 0 1.333613e-02
## H.npnct12.log 0 1.305305e-02
## A.P.http 0 1.294748e-02
## A.npnct18.log 0 1.271661e-02
## S.npnct03.log 0 1.240734e-02
## myCategory.fctr 0 1.234541e-02
## S.npnct07.log 0 1.214357e-02
## A.npnct07.log 0 1.214357e-02
## H.npnct07.log 0 1.201741e-02
## PubDate.second.fctr 0 1.187946e-02
## UniqueID 1 1.182492e-02
## PubDate.date.fctr 0 1.164756e-02
## A.T.one 0 1.051414e-02
## S.T.one 0 1.050293e-02
## H.T.bank 0 1.037439e-02
## H.T.obama 0 9.878461e-03
## H.T.say 0 9.763205e-03
## H.P.friday.night.music 0 9.653967e-03
## H.npnct05.log 0 9.653967e-03
## H.npnct03.log 0 9.533020e-03
## .rnorm 0 8.244230e-03
## H.P.s.notebook 0 7.755542e-03
## PubDate.last100.log 0 7.663322e-03
## H.npnct10.log 0 5.547032e-03
## H.npnct20.log 0 5.547032e-03
## S.npnct02.log 0 5.547032e-03
## S.npnct10.log 0 5.547032e-03
## A.npnct10.log 0 5.547032e-03
## A.npnct25.log 0 5.547032e-03
## A.npnct08.log 0 4.193476e-03
## S.npnct08.log 0 3.372706e-03
## S.T.presid 0 2.381159e-03
## A.T.presid 0 2.090565e-03
## S.npnct16.log 0 1.587454e-03
## A.npnct16.log 0 1.587454e-03
## H.T.take 0 1.263270e-03
## H.npnct24.log 0 9.890046e-19
## S.npnct24.log 0 9.890046e-19
## A.npnct24.log 0 9.890046e-19
## H.npnct09.log 0 NA
## H.npnct17.log 0 NA
## H.npnct18.log 0 NA
## H.npnct21.log 0 NA
## H.npnct22.log 0 NA
## H.npnct23.log 0 NA
## H.npnct25.log 0 NA
## H.npnct26.log 0 NA
## H.npnct27.log 0 NA
## H.npnct29.log 0 NA
## H.npnct30.log 0 NA
## H.P.http 0 NA
## S.npnct05.log 0 NA
## S.npnct09.log 0 NA
## S.npnct17.log 0 NA
## S.npnct18.log 0 NA
## S.npnct22.log 0 NA
## S.npnct25.log 0 NA
## S.npnct26.log 0 NA
## S.npnct27.log 0 NA
## S.npnct29.log 0 NA
## S.npnct30.log 0 NA
## S.P.http 0 NA
## A.npnct05.log 0 NA
## A.npnct09.log 0 NA
## A.npnct22.log 0 NA
## A.npnct26.log 0 NA
## A.npnct27.log 0 NA
## A.npnct29.log 0 NA
## A.npnct30.log 0 NA
## PubDate.year.fctr 0 NA
# sav_feats_df <- glb_feats_df
# Step 1: annotate the feature table with highly correlated feature pairs.
# myfind_cor_features() prints each pairwise cor() it flags and raises a
# warning naming the feature it identified as redundant.
glb_feats_df <- myfind_cor_features(feats_df = glb_feats_df,
                                    entity_df = glb_trnobs_df,
                                    rsp_var = glb_rsp_var)
# Step 2: sort by descending correlation with the response, then display.
glb_feats_df <- orderBy(~ -cor.y, glb_feats_df)
print(glb_feats_df)
## Loading required package: caret
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:survival':
##
## cluster
## [1] "cor(A.npnct01.log, S.npnct01.log)=1.0000"
## [1] "cor(Popular.fctr, A.npnct01.log)=0.0309"
## [1] "cor(Popular.fctr, S.npnct01.log)=0.0309"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.npnct01.log as highly correlated with
## A.npnct01.log
## [1] "cor(A.npnct04.log, S.npnct04.log)=1.0000"
## [1] "cor(Popular.fctr, A.npnct04.log)=-0.0629"
## [1] "cor(Popular.fctr, S.npnct04.log)=-0.0629"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.npnct04.log as highly correlated with
## A.npnct04.log
## [1] "cor(A.npnct06.log, S.npnct06.log)=1.0000"
## [1] "cor(Popular.fctr, A.npnct06.log)=-0.0239"
## [1] "cor(Popular.fctr, S.npnct06.log)=-0.0239"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.npnct06.log as highly correlated with
## A.npnct06.log
## [1] "cor(A.npnct07.log, S.npnct07.log)=1.0000"
## [1] "cor(Popular.fctr, A.npnct07.log)=-0.0121"
## [1] "cor(Popular.fctr, S.npnct07.log)=-0.0121"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.npnct07.log as highly correlated with
## A.npnct07.log
## [1] "cor(A.npnct20.log, S.npnct20.log)=1.0000"
## [1] "cor(Popular.fctr, A.npnct20.log)=-0.0192"
## [1] "cor(Popular.fctr, S.npnct20.log)=-0.0192"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.npnct20.log as highly correlated with
## A.npnct20.log
## [1] "cor(A.npnct21.log, A.npnct23.log)=1.0000"
## [1] "cor(Popular.fctr, A.npnct21.log)=0.0154"
## [1] "cor(Popular.fctr, A.npnct23.log)=0.0154"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified A.npnct23.log as highly correlated with
## A.npnct21.log
## [1] "cor(A.P.daily.clip.report, H.P.daily.clip.report)=1.0000"
## [1] "cor(Popular.fctr, A.P.daily.clip.report)=-0.0439"
## [1] "cor(Popular.fctr, H.P.daily.clip.report)=-0.0439"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified H.P.daily.clip.report as highly correlated with
## A.P.daily.clip.report
## [1] "cor(A.P.daily.clip.report, S.P.daily.clip.report)=1.0000"
## [1] "cor(Popular.fctr, A.P.daily.clip.report)=-0.0439"
## [1] "cor(Popular.fctr, S.P.daily.clip.report)=-0.0439"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.P.daily.clip.report as highly correlated with
## A.P.daily.clip.report
## [1] "cor(A.P.fashion.week, S.P.fashion.week)=1.0000"
## [1] "cor(Popular.fctr, A.P.fashion.week)=-0.0708"
## [1] "cor(Popular.fctr, S.P.fashion.week)=-0.0708"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.P.fashion.week as highly correlated with
## A.P.fashion.week
## [1] "cor(A.P.first.draft, S.P.first.draft)=1.0000"
## [1] "cor(Popular.fctr, A.P.first.draft)=-0.0215"
## [1] "cor(Popular.fctr, S.P.first.draft)=-0.0215"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.P.first.draft as highly correlated with
## A.P.first.draft
## [1] "cor(A.P.metropolitan.diary.colon, S.P.metropolitan.diary.colon)=1.0000"
## [1] "cor(Popular.fctr, A.P.metropolitan.diary.colon)=-0.0284"
## [1] "cor(Popular.fctr, S.P.metropolitan.diary.colon)=-0.0284"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.P.metropolitan.diary.colon as highly
## correlated with A.P.metropolitan.diary.colon
## [1] "cor(A.P.year.colon, S.P.year.colon)=1.0000"
## [1] "cor(Popular.fctr, A.P.year.colon)=-0.0176"
## [1] "cor(Popular.fctr, S.P.year.colon)=-0.0176"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.P.year.colon as highly correlated with
## A.P.year.colon
## [1] "cor(A.T.appear, S.T.appear)=1.0000"
## [1] "cor(Popular.fctr, A.T.appear)=-0.0394"
## [1] "cor(Popular.fctr, S.T.appear)=-0.0394"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.T.appear as highly correlated with A.T.appear
## [1] "cor(A.T.archiv, S.T.archiv)=1.0000"
## [1] "cor(Popular.fctr, A.T.archiv)=-0.0720"
## [1] "cor(Popular.fctr, S.T.archiv)=-0.0720"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.T.archiv as highly correlated with A.T.archiv
## [1] "cor(A.T.diari, S.T.diari)=1.0000"
## [1] "cor(Popular.fctr, A.T.diari)=-0.0623"
## [1] "cor(Popular.fctr, S.T.diari)=-0.0623"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.T.diari as highly correlated with A.T.diari
## [1] "cor(A.T.herald, S.T.herald)=1.0000"
## [1] "cor(Popular.fctr, A.T.herald)=-0.0675"
## [1] "cor(Popular.fctr, S.T.herald)=-0.0675"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.T.herald as highly correlated with A.T.herald
## [1] "cor(A.T.share, S.T.share)=1.0000"
## [1] "cor(Popular.fctr, A.T.share)=-0.0511"
## [1] "cor(Popular.fctr, S.T.share)=-0.0511"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.T.share as highly correlated with A.T.share
## [1] "cor(A.T.tribun, S.T.tribun)=1.0000"
## [1] "cor(Popular.fctr, A.T.tribun)=-0.0701"
## [1] "cor(Popular.fctr, S.T.tribun)=-0.0701"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.T.tribun as highly correlated with A.T.tribun
## [1] "cor(S.npnct21.log, S.npnct23.log)=1.0000"
## [1] "cor(Popular.fctr, S.npnct21.log)=0.0276"
## [1] "cor(Popular.fctr, S.npnct23.log)=0.0276"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.npnct23.log as highly correlated with
## S.npnct21.log
## [1] "cor(A.P.daily.clip.report, H.T.clip)=1.0000"
## [1] "cor(Popular.fctr, A.P.daily.clip.report)=-0.0439"
## [1] "cor(Popular.fctr, H.T.clip)=-0.0439"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df
## = glb_trnobs_df, : Identified H.T.clip as highly correlated with
## A.P.daily.clip.report
## [1] "cor(A.T.fashion, S.T.fashion)=1.0000"
## [1] "cor(Popular.fctr, A.T.fashion)=-0.0842"
## [1] "cor(Popular.fctr, S.T.fashion)=-0.0842"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df
## = glb_trnobs_df, : Identified A.T.fashion as highly correlated with
## S.T.fashion
## [1] "cor(A.T.photo, S.T.photo)=1.0000"
## [1] "cor(Popular.fctr, A.T.photo)=-0.0687"
## [1] "cor(Popular.fctr, S.T.photo)=-0.0687"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified A.T.photo as highly correlated with S.T.photo
## [1] "cor(A.T.newyorktim, S.T.newyorktim)=1.0000"
## [1] "cor(Popular.fctr, A.T.newyorktim)=-0.0498"
## [1] "cor(Popular.fctr, S.T.newyorktim)=-0.0499"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified A.T.newyorktim as highly correlated with
## S.T.newyorktim
## [1] "cor(A.T.word, S.T.word)=1.0000"
## [1] "cor(Popular.fctr, A.T.word)=-0.0482"
## [1] "cor(Popular.fctr, S.T.word)=-0.0482"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified A.T.word as highly correlated with S.T.word
## [1] "cor(A.T.articl, S.T.articl)=1.0000"
## [1] "cor(Popular.fctr, A.T.articl)=-0.0547"
## [1] "cor(Popular.fctr, S.T.articl)=-0.0547"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified A.T.articl as highly correlated with S.T.articl
## [1] "cor(A.T.show, S.T.show)=1.0000"
## [1] "cor(Popular.fctr, A.T.show)=-0.0419"
## [1] "cor(Popular.fctr, S.T.show)=-0.0418"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.T.show as highly correlated with A.T.show
## [1] "cor(A.T.make, S.T.make)=1.0000"
## [1] "cor(Popular.fctr, A.T.make)=0.0412"
## [1] "cor(Popular.fctr, S.T.make)=0.0412"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.T.make as highly correlated with A.T.make
## [1] "cor(A.T.intern, S.T.intern)=1.0000"
## [1] "cor(Popular.fctr, A.T.intern)=-0.0695"
## [1] "cor(Popular.fctr, S.T.intern)=-0.0696"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified A.T.intern as highly correlated with S.T.intern
## [1] "cor(A.T.report, S.T.report)=1.0000"
## [1] "cor(Popular.fctr, A.T.report)=-0.0477"
## [1] "cor(Popular.fctr, S.T.report)=-0.0478"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified A.T.report as highly correlated with S.T.report
## [1] "cor(A.T.week, S.T.week)=1.0000"
## [1] "cor(Popular.fctr, A.T.week)=-0.0854"
## [1] "cor(Popular.fctr, S.T.week)=-0.0855"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified A.T.week as highly correlated with S.T.week
## [1] "cor(A.T.newyork, S.T.newyork)=0.9999"
## [1] "cor(Popular.fctr, A.T.newyork)=-0.0469"
## [1] "cor(Popular.fctr, S.T.newyork)=-0.0469"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df
## = glb_trnobs_df, : Identified A.T.newyork as highly correlated with
## S.T.newyork
## [1] "cor(A.T.said, S.T.said)=0.9999"
## [1] "cor(Popular.fctr, A.T.said)=0.0188"
## [1] "cor(Popular.fctr, S.T.said)=0.0186"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.T.said as highly correlated with A.T.said
## [1] "cor(A.T.year, S.T.year)=0.9999"
## [1] "cor(Popular.fctr, A.T.year)=-0.0374"
## [1] "cor(Popular.fctr, S.T.year)=-0.0376"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified A.T.year as highly correlated with S.T.year
## [1] "cor(A.ratio.sum.TfIdf.nwrds, S.ratio.sum.TfIdf.nwrds)=0.9999"
## [1] "cor(Popular.fctr, A.ratio.sum.TfIdf.nwrds)=0.2624"
## [1] "cor(Popular.fctr, S.ratio.sum.TfIdf.nwrds)=0.2623"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.ratio.sum.TfIdf.nwrds as highly correlated
## with A.ratio.sum.TfIdf.nwrds
## [1] "cor(A.T.compani, S.T.compani)=0.9999"
## [1] "cor(Popular.fctr, A.T.compani)=-0.0477"
## [1] "cor(Popular.fctr, S.T.compani)=-0.0479"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df
## = glb_trnobs_df, : Identified A.T.compani as highly correlated with
## S.T.compani
## [1] "cor(A.T.first, S.T.first)=0.9999"
## [1] "cor(Popular.fctr, A.T.first)=-0.0443"
## [1] "cor(Popular.fctr, S.T.first)=-0.0445"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified A.T.first as highly correlated with S.T.first
## [1] "cor(A.T.new, S.T.new)=0.9998"
## [1] "cor(Popular.fctr, A.T.new)=-0.0260"
## [1] "cor(Popular.fctr, S.T.new)=-0.0259"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.T.new as highly correlated with A.T.new
## [1] "cor(A.npnct11.log, S.npnct11.log)=0.9997"
## [1] "cor(Popular.fctr, A.npnct11.log)=-0.0918"
## [1] "cor(Popular.fctr, S.npnct11.log)=-0.0916"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.npnct11.log as highly correlated with
## A.npnct11.log
## [1] "cor(A.T.one, S.T.one)=0.9997"
## [1] "cor(Popular.fctr, A.T.one)=0.0105"
## [1] "cor(Popular.fctr, S.T.one)=0.0105"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.T.one as highly correlated with A.T.one
## [1] "cor(A.T.take, S.T.take)=0.9997"
## [1] "cor(Popular.fctr, A.T.take)=-0.0227"
## [1] "cor(Popular.fctr, S.T.take)=-0.0226"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.T.take as highly correlated with A.T.take
## [1] "cor(A.sum.TfIdf, S.sum.TfIdf)=0.9997"
## [1] "cor(Popular.fctr, A.sum.TfIdf)=0.1478"
## [1] "cor(Popular.fctr, S.sum.TfIdf)=0.1485"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df
## = glb_trnobs_df, : Identified A.sum.TfIdf as highly correlated with
## S.sum.TfIdf
## [1] "cor(A.T.can, S.T.can)=0.9996"
## [1] "cor(Popular.fctr, A.T.can)=0.0308"
## [1] "cor(Popular.fctr, S.T.can)=0.0301"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.T.can as highly correlated with A.T.can
## [1] "cor(A.T.will, S.T.will)=0.9996"
## [1] "cor(Popular.fctr, A.T.will)=-0.0388"
## [1] "cor(Popular.fctr, S.T.will)=-0.0389"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified A.T.will as highly correlated with S.T.will
## [1] "cor(A.T.day, S.T.day)=0.9996"
## [1] "cor(Popular.fctr, A.T.day)=-0.0427"
## [1] "cor(Popular.fctr, S.T.day)=-0.0426"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.T.day as highly correlated with A.T.day
## [1] "cor(A.T.time, S.T.time)=0.9996"
## [1] "cor(Popular.fctr, A.T.time)=-0.0243"
## [1] "cor(Popular.fctr, S.T.time)=-0.0242"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.T.time as highly correlated with A.T.time
## [1] "cor(A.T.senat, S.T.senat)=0.9996"
## [1] "cor(Popular.fctr, A.T.senat)=-0.0414"
## [1] "cor(Popular.fctr, S.T.senat)=-0.0414"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified A.T.senat as highly correlated with S.T.senat
## [1] "cor(A.T.obama, S.T.obama)=0.9995"
## [1] "cor(Popular.fctr, A.T.obama)=-0.0191"
## [1] "cor(Popular.fctr, S.T.obama)=-0.0191"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.T.obama as highly correlated with A.T.obama
## [1] "cor(A.ratio.nstopwrds.nwrds, S.ratio.nstopwrds.nwrds)=0.9994"
## [1] "cor(Popular.fctr, A.ratio.nstopwrds.nwrds)=0.1214"
## [1] "cor(Popular.fctr, S.ratio.nstopwrds.nwrds)=0.1207"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.ratio.nstopwrds.nwrds as highly correlated
## with A.ratio.nstopwrds.nwrds
## [1] "cor(A.nuppr.log, S.nuppr.log)=0.9991"
## [1] "cor(Popular.fctr, A.nuppr.log)=-0.2721"
## [1] "cor(Popular.fctr, S.nuppr.log)=-0.2718"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df
## = glb_trnobs_df, : Identified S.nuppr.log as highly correlated with
## A.nuppr.log
## [1] "cor(A.nwrds.unq.log, S.nwrds.unq.log)=0.9989"
## [1] "cor(Popular.fctr, A.nwrds.unq.log)=-0.2460"
## [1] "cor(Popular.fctr, S.nwrds.unq.log)=-0.2462"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified A.nwrds.unq.log as highly correlated with
## S.nwrds.unq.log
## [1] "cor(A.npnct28.log, S.npnct28.log)=0.9989"
## [1] "cor(Popular.fctr, A.npnct28.log)=-0.0437"
## [1] "cor(Popular.fctr, S.npnct28.log)=-0.0437"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.npnct28.log as highly correlated with
## A.npnct28.log
## [1] "cor(A.nstopwrds.log, S.nstopwrds.log)=0.9989"
## [1] "cor(Popular.fctr, A.nstopwrds.log)=-0.1154"
## [1] "cor(Popular.fctr, S.nstopwrds.log)=-0.1148"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.nstopwrds.log as highly correlated with
## A.nstopwrds.log
## [1] "cor(A.nchrs.log, S.nchrs.log)=0.9986"
## [1] "cor(Popular.fctr, A.nchrs.log)=-0.2245"
## [1] "cor(Popular.fctr, S.nchrs.log)=-0.2247"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df
## = glb_trnobs_df, : Identified A.nchrs.log as highly correlated with
## S.nchrs.log
## [1] "cor(A.nwrds.log, S.nwrds.log)=0.9985"
## [1] "cor(Popular.fctr, A.nwrds.log)=-0.1979"
## [1] "cor(Popular.fctr, S.nwrds.log)=-0.1978"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df
## = glb_trnobs_df, : Identified S.nwrds.log as highly correlated with
## A.nwrds.log
## [1] "cor(A.npnct19.log, S.npnct19.log)=0.9957"
## [1] "cor(Popular.fctr, A.npnct19.log)=0.0548"
## [1] "cor(Popular.fctr, S.npnct19.log)=0.0550"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified A.npnct19.log as highly correlated with
## S.npnct19.log
## [1] "cor(A.ndgts.log, S.ndgts.log)=0.9955"
## [1] "cor(Popular.fctr, A.ndgts.log)=-0.1249"
## [1] "cor(Popular.fctr, S.ndgts.log)=-0.1242"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df
## = glb_trnobs_df, : Identified S.ndgts.log as highly correlated with
## A.ndgts.log
## [1] "cor(A.npnct12.log, S.npnct12.log)=0.9935"
## [1] "cor(Popular.fctr, A.npnct12.log)=-0.0376"
## [1] "cor(Popular.fctr, S.npnct12.log)=-0.0364"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.npnct12.log as highly correlated with
## A.npnct12.log
## [1] "cor(A.T.herald, A.T.tribun)=0.9935"
## [1] "cor(Popular.fctr, A.T.herald)=-0.0675"
## [1] "cor(Popular.fctr, A.T.tribun)=-0.0701"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified A.T.herald as highly correlated with A.T.tribun
## [1] "cor(A.npnct15.log, S.npnct15.log)=0.9917"
## [1] "cor(Popular.fctr, A.npnct15.log)=-0.0689"
## [1] "cor(Popular.fctr, S.npnct15.log)=-0.0677"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.npnct15.log as highly correlated with
## A.npnct15.log
## [1] "cor(A.npnct18.log, A.P.http)=0.9882"
## [1] "cor(Popular.fctr, A.npnct18.log)=-0.0127"
## [1] "cor(Popular.fctr, A.P.http)=-0.0129"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified A.npnct18.log as highly correlated with
## A.P.http
## [1] "cor(A.npnct13.log, S.npnct13.log)=0.9795"
## [1] "cor(Popular.fctr, A.npnct13.log)=-0.0500"
## [1] "cor(Popular.fctr, S.npnct13.log)=-0.0533"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified A.npnct13.log as highly correlated with
## S.npnct13.log
## [1] "cor(A.nwrds.log, S.nchrs.log)=0.9717"
## [1] "cor(Popular.fctr, A.nwrds.log)=-0.1979"
## [1] "cor(Popular.fctr, S.nchrs.log)=-0.2247"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df
## = glb_trnobs_df, : Identified A.nwrds.log as highly correlated with
## S.nchrs.log
## [1] "cor(A.T.archiv, A.T.tribun)=0.9663"
## [1] "cor(Popular.fctr, A.T.archiv)=-0.0720"
## [1] "cor(Popular.fctr, A.T.tribun)=-0.0701"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified A.T.tribun as highly correlated with A.T.archiv
## [1] "cor(H.npnct14.log, H.T.springsumm)=0.9650"
## [1] "cor(Popular.fctr, H.npnct14.log)=-0.0616"
## [1] "cor(Popular.fctr, H.T.springsumm)=-0.0594"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified H.T.springsumm as highly correlated with
## H.npnct14.log
## [1] "cor(S.nchrs.log, S.nwrds.unq.log)=0.9537"
## [1] "cor(Popular.fctr, S.nchrs.log)=-0.2247"
## [1] "cor(Popular.fctr, S.nwrds.unq.log)=-0.2462"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df
## = glb_trnobs_df, : Identified S.nchrs.log as highly correlated with
## S.nwrds.unq.log
## [1] "cor(A.npnct02.log, A.P.http)=0.9261"
## [1] "cor(Popular.fctr, A.npnct02.log)=-0.0145"
## [1] "cor(Popular.fctr, A.P.http)=-0.0129"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df
## = glb_trnobs_df, : Identified A.P.http as highly correlated with
## A.npnct02.log
## [1] "cor(H.nchrs.log, H.nwrds.log)=0.9212"
## [1] "cor(Popular.fctr, H.nchrs.log)=-0.1711"
## [1] "cor(Popular.fctr, H.nwrds.log)=-0.1573"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df
## = glb_trnobs_df, : Identified H.nwrds.log as highly correlated with
## H.nchrs.log
## [1] "cor(A.npnct03.log, S.npnct03.log)=0.9128"
## [1] "cor(Popular.fctr, A.npnct03.log)=-0.0136"
## [1] "cor(Popular.fctr, S.npnct03.log)=-0.0124"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.npnct03.log as highly correlated with
## A.npnct03.log
## [1] "cor(H.nchrs.log, H.nwrds.unq.log)=0.8882"
## [1] "cor(Popular.fctr, H.nchrs.log)=-0.1711"
## [1] "cor(Popular.fctr, H.nwrds.unq.log)=-0.2014"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df
## = glb_trnobs_df, : Identified H.nchrs.log as highly correlated with
## H.nwrds.unq.log
## [1] "cor(A.T.archiv, S.T.intern)=0.8869"
## [1] "cor(Popular.fctr, A.T.archiv)=-0.0720"
## [1] "cor(Popular.fctr, S.T.intern)=-0.0696"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.T.intern as highly correlated with A.T.archiv
## [1] "cor(A.P.daily.clip.report, H.T.daili)=0.8858"
## [1] "cor(Popular.fctr, A.P.daily.clip.report)=-0.0439"
## [1] "cor(Popular.fctr, H.T.daili)=-0.0630"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified A.P.daily.clip.report as highly correlated with
## H.T.daili
## [1] "cor(H.npnct14.log, H.T.X2015)=0.8848"
## [1] "cor(Popular.fctr, H.npnct14.log)=-0.0616"
## [1] "cor(Popular.fctr, H.T.X2015)=-0.0660"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified H.npnct14.log as highly correlated with
## H.T.X2015
## [1] "cor(A.npnct02.log, A.npnct17.log)=0.8745"
## [1] "cor(Popular.fctr, A.npnct02.log)=-0.0145"
## [1] "cor(Popular.fctr, A.npnct17.log)=-0.0146"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified A.npnct02.log as highly correlated with
## A.npnct17.log
## [1] "cor(H.P.today.in.politic, H.T.polit)=0.8713"
## [1] "cor(Popular.fctr, H.P.today.in.politic)=-0.0373"
## [1] "cor(Popular.fctr, H.T.polit)=-0.0306"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df
## = glb_trnobs_df, : Identified H.T.polit as highly correlated with
## H.P.today.in.politic
## [1] "cor(A.T.archiv, H.P.year.colon)=0.8509"
## [1] "cor(Popular.fctr, A.T.archiv)=-0.0720"
## [1] "cor(Popular.fctr, H.P.year.colon)=-0.0784"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df
## = glb_trnobs_df, : Identified A.T.archiv as highly correlated with
## H.P.year.colon
## [1] "cor(H.P.what.we.are, H.T.read)=0.8479"
## [1] "cor(Popular.fctr, H.P.what.we.are)=-0.0378"
## [1] "cor(Popular.fctr, H.T.read)=-0.0347"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df
## = glb_trnobs_df, : Identified H.T.read as highly correlated with
## H.P.what.we.are
## [1] "cor(H.P.fashion.week, H.T.fashion)=0.8336"
## [1] "cor(Popular.fctr, H.P.fashion.week)=-0.0763"
## [1] "cor(Popular.fctr, H.T.fashion)=-0.0795"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified H.P.fashion.week as highly correlated with
## H.T.fashion
## [1] "cor(A.npnct28.log, H.T.morn)=0.8327"
## [1] "cor(Popular.fctr, A.npnct28.log)=-0.0437"
## [1] "cor(Popular.fctr, H.T.morn)=-0.0484"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified A.npnct28.log as highly correlated with
## H.T.morn
## [1] "cor(H.nuppr.log, H.nwrds.unq.log)=0.8295"
## [1] "cor(Popular.fctr, H.nuppr.log)=-0.1278"
## [1] "cor(Popular.fctr, H.nwrds.unq.log)=-0.2014"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df
## = glb_trnobs_df, : Identified H.nuppr.log as highly correlated with
## H.nwrds.unq.log
## [1] "cor(H.T.daili, H.T.report)=0.8248"
## [1] "cor(Popular.fctr, H.T.daili)=-0.0630"
## [1] "cor(Popular.fctr, H.T.report)=-0.0624"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified H.T.report as highly correlated with H.T.daili
## [1] "cor(H.npnct06.log, H.npnct16.log)=0.8106"
## [1] "cor(Popular.fctr, H.npnct06.log)=0.0319"
## [1] "cor(Popular.fctr, H.npnct16.log)=0.0304"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified H.npnct16.log as highly correlated with
## H.npnct06.log
## [1] "cor(H.P.today.in.politic, H.T.today)=0.8042"
## [1] "cor(Popular.fctr, H.P.today.in.politic)=-0.0373"
## [1] "cor(Popular.fctr, H.T.today)=-0.0583"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified H.P.today.in.politic as highly correlated with
## H.T.today
## [1] "cor(A.P.fashion.week, S.T.fashion)=0.8031"
## [1] "cor(Popular.fctr, A.P.fashion.week)=-0.0708"
## [1] "cor(Popular.fctr, S.T.fashion)=-0.0842"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified A.P.fashion.week as highly correlated with
## S.T.fashion
## [1] "cor(A.T.appear, H.T.word)=0.7960"
## [1] "cor(Popular.fctr, A.T.appear)=-0.0394"
## [1] "cor(Popular.fctr, H.T.word)=-0.0138"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified H.T.word as highly correlated with A.T.appear
## [1] "cor(A.ratio.sum.TfIdf.nwrds, S.nwrds.unq.log)=-0.7701"
## [1] "cor(Popular.fctr, A.ratio.sum.TfIdf.nwrds)=0.2624"
## [1] "cor(Popular.fctr, S.nwrds.unq.log)=-0.2462"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified S.nwrds.unq.log as highly correlated with
## A.ratio.sum.TfIdf.nwrds
## [1] "cor(A.npnct14.log, A.npnct17.log)=0.7663"
## [1] "cor(Popular.fctr, A.npnct14.log)=-0.0241"
## [1] "cor(Popular.fctr, A.npnct17.log)=-0.0146"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified A.npnct17.log as highly correlated with
## A.npnct14.log
## [1] "cor(A.T.diari, H.T.X2015)=0.7526"
## [1] "cor(Popular.fctr, A.T.diari)=-0.0623"
## [1] "cor(Popular.fctr, H.T.X2015)=-0.0660"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified A.T.diari as highly correlated with H.T.X2015
## [1] "cor(A.npnct21.log, S.npnct21.log)=0.7461"
## [1] "cor(Popular.fctr, A.npnct21.log)=0.0154"
## [1] "cor(Popular.fctr, S.npnct21.log)=0.0276"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified A.npnct21.log as highly correlated with
## S.npnct21.log
## [1] "cor(H.T.X2015, S.T.fashion)=0.7294"
## [1] "cor(Popular.fctr, H.T.X2015)=-0.0660"
## [1] "cor(Popular.fctr, S.T.fashion)=-0.0842"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified H.T.X2015 as highly correlated with S.T.fashion
## [1] "cor(H.P.first.draft, H.T.first)=0.7289"
## [1] "cor(Popular.fctr, H.P.first.draft)=-0.0432"
## [1] "cor(Popular.fctr, H.T.first)=-0.0447"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified H.P.first.draft as highly correlated with
## H.T.first
## [1] "cor(A.nstopwrds.log, A.ratio.sum.TfIdf.nwrds)=-0.7148"
## [1] "cor(Popular.fctr, A.nstopwrds.log)=-0.1154"
## [1] "cor(Popular.fctr, A.ratio.sum.TfIdf.nwrds)=0.2624"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnobs_df, : Identified A.nstopwrds.log as highly correlated with
## A.ratio.sum.TfIdf.nwrds
## [1] "cor(H.npnct04.log, H.T.billion)=0.7051"
## [1] "cor(Popular.fctr, H.npnct04.log)=-0.0513"
## [1] "cor(Popular.fctr, H.T.billion)=-0.0278"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df
## = glb_trnobs_df, : Identified H.T.billion as highly correlated with
## H.npnct04.log
## id cor.y
## Popular Popular 1.000000e+00
## WordCount.log WordCount.log 2.656836e-01
## A.ratio.sum.TfIdf.nwrds A.ratio.sum.TfIdf.nwrds 2.623865e-01
## S.ratio.sum.TfIdf.nwrds S.ratio.sum.TfIdf.nwrds 2.622549e-01
## WordCount WordCount 2.575265e-01
## H.ratio.sum.TfIdf.nwrds H.ratio.sum.TfIdf.nwrds 2.254527e-01
## .clusterid .clusterid 1.820567e-01
## .clusterid.fctr .clusterid.fctr 1.813987e-01
## H.sum.TfIdf H.sum.TfIdf 1.520414e-01
## S.sum.TfIdf S.sum.TfIdf 1.484963e-01
## A.sum.TfIdf A.sum.TfIdf 1.478461e-01
## PubDate.hour.fctr PubDate.hour.fctr 1.354368e-01
## H.npnct19.log H.npnct19.log 1.283641e-01
## A.ratio.nstopwrds.nwrds A.ratio.nstopwrds.nwrds 1.213545e-01
## S.ratio.nstopwrds.nwrds S.ratio.nstopwrds.nwrds 1.206896e-01
## PubDate.wkend PubDate.wkend 1.067288e-01
## H.P.recap.colon H.P.recap.colon 9.008096e-02
## H.P.quandary H.P.quandary 8.734922e-02
## H.P.no.comment.colon H.P.no.comment.colon 6.074669e-02
## S.npnct19.log S.npnct19.log 5.503894e-02
## A.npnct19.log A.npnct19.log 5.482747e-02
## H.P.facts.figures H.P.facts.figures 5.410097e-02
## PubDate.last10 PubDate.last10 5.398093e-02
## H.npnct08.log H.npnct08.log 5.375262e-02
## PubDate.last10.log PubDate.last10.log 4.931702e-02
## PubDate.last1.log PubDate.last1.log 4.635751e-02
## H.P.readers.respond H.P.readers.respond 4.432886e-02
## A.T.make A.T.make 4.124187e-02
## S.T.make S.T.make 4.118050e-02
## H.ratio.nstopwrds.nwrds H.ratio.nstopwrds.nwrds 4.024406e-02
## PubDate.last100 PubDate.last100 3.989229e-02
## PubDate.last1 PubDate.last1 3.592267e-02
## H.T.get H.T.get 3.300192e-02
## H.npnct06.log H.npnct06.log 3.190718e-02
## A.npnct01.log A.npnct01.log 3.093101e-02
## S.npnct01.log S.npnct01.log 3.093101e-02
## A.T.can A.T.can 3.083389e-02
## H.npnct16.log H.npnct16.log 3.039622e-02
## S.T.can S.T.can 3.005998e-02
## S.npnct21.log S.npnct21.log 2.760321e-02
## S.npnct23.log S.npnct23.log 2.760321e-02
## H.T.ebola H.T.ebola 2.682920e-02
## H.npnct01.log H.npnct01.log 2.271577e-02
## PubDate.month.fctr PubDate.month.fctr 1.914874e-02
## A.T.said A.T.said 1.876762e-02
## S.T.said S.T.said 1.863436e-02
## PubDate.POSIX PubDate.POSIX 1.568326e-02
## PubDate.zoo PubDate.zoo 1.568326e-02
## A.npnct21.log A.npnct21.log 1.537569e-02
## A.npnct23.log A.npnct23.log 1.537569e-02
## H.T.make H.T.make 1.349595e-02
## H.npnct11.log H.npnct11.log 1.333613e-02
## myCategory.fctr myCategory.fctr 1.234541e-02
## UniqueID UniqueID 1.182492e-02
## A.T.one A.T.one 1.051414e-02
## S.T.one S.T.one 1.050293e-02
## H.npnct03.log H.npnct03.log 9.533020e-03
## H.P.s.notebook H.P.s.notebook 7.755542e-03
## A.npnct24.log A.npnct24.log -9.890046e-19
## H.npnct24.log H.npnct24.log -9.890046e-19
## S.npnct24.log S.npnct24.log -9.890046e-19
## H.T.take H.T.take -1.263270e-03
## A.npnct16.log A.npnct16.log -1.587454e-03
## S.npnct16.log S.npnct16.log -1.587454e-03
## A.T.presid A.T.presid -2.090565e-03
## S.T.presid S.T.presid -2.381159e-03
## S.npnct08.log S.npnct08.log -3.372706e-03
## A.npnct08.log A.npnct08.log -4.193476e-03
## A.npnct25.log A.npnct25.log -5.547032e-03
## A.npnct10.log A.npnct10.log -5.547032e-03
## H.npnct10.log H.npnct10.log -5.547032e-03
## H.npnct20.log H.npnct20.log -5.547032e-03
## S.npnct02.log S.npnct02.log -5.547032e-03
## S.npnct10.log S.npnct10.log -5.547032e-03
## PubDate.last100.log PubDate.last100.log -7.663322e-03
## .rnorm .rnorm -8.244230e-03
## H.npnct05.log H.npnct05.log -9.653967e-03
## H.P.friday.night.music H.P.friday.night.music -9.653967e-03
## H.T.say H.T.say -9.763205e-03
## H.T.obama H.T.obama -9.878461e-03
## H.T.bank H.T.bank -1.037439e-02
## PubDate.date.fctr PubDate.date.fctr -1.164756e-02
## PubDate.second.fctr PubDate.second.fctr -1.187946e-02
## H.npnct07.log H.npnct07.log -1.201741e-02
## A.npnct07.log A.npnct07.log -1.214357e-02
## S.npnct07.log S.npnct07.log -1.214357e-02
## S.npnct03.log S.npnct03.log -1.240734e-02
## A.npnct18.log A.npnct18.log -1.271661e-02
## A.P.http A.P.http -1.294748e-02
## H.npnct12.log H.npnct12.log -1.305305e-02
## A.npnct03.log A.npnct03.log -1.359260e-02
## H.T.word H.T.word -1.382927e-02
## H.T.big H.T.big -1.438162e-02
## A.npnct02.log A.npnct02.log -1.451467e-02
## A.npnct17.log A.npnct17.log -1.457558e-02
## A.P.year.colon A.P.year.colon -1.755336e-02
## S.P.year.colon S.P.year.colon -1.755336e-02
## S.T.obama S.T.obama -1.914281e-02
## A.T.obama A.T.obama -1.914924e-02
## A.npnct20.log A.npnct20.log -1.923169e-02
## S.npnct20.log S.npnct20.log -1.923169e-02
## H.npnct02.log H.npnct02.log -2.001851e-02
## H.T.test H.T.test -2.117852e-02
## S.npnct14.log S.npnct14.log -2.121844e-02
## A.P.first.draft A.P.first.draft -2.150663e-02
## H.P.on.this.day H.P.on.this.day -2.150663e-02
## S.P.first.draft S.P.first.draft -2.150663e-02
## S.T.take S.T.take -2.264447e-02
## A.T.take A.T.take -2.271897e-02
## A.npnct06.log A.npnct06.log -2.389145e-02
## S.npnct06.log S.npnct06.log -2.389145e-02
## A.npnct14.log A.npnct14.log -2.407715e-02
## S.T.time S.T.time -2.416246e-02
## A.T.time A.T.time -2.430509e-02
## H.T.newyorktim H.T.newyorktim -2.514415e-02
## H.npnct13.log H.npnct13.log -2.524770e-02
## H.T.deal H.T.deal -2.556237e-02
## S.T.new S.T.new -2.592872e-02
## A.T.new A.T.new -2.597887e-02
## H.T.billion H.T.billion -2.776561e-02
## A.P.metropolitan.diary.colon A.P.metropolitan.diary.colon -2.841404e-02
## S.P.metropolitan.diary.colon S.P.metropolitan.diary.colon -2.841404e-02
## H.T.polit H.T.polit -3.058564e-02
## H.P.verbatim.colon H.P.verbatim.colon -3.194363e-02
## H.T.china H.T.china -3.283653e-02
## H.T.art H.T.art -3.291486e-02
## PubDate.minute.fctr PubDate.minute.fctr -3.407385e-02
## H.T.read H.T.read -3.467043e-02
## S.npnct12.log S.npnct12.log -3.638891e-02
## H.P.today.in.politic H.P.today.in.politic -3.733661e-02
## A.T.year A.T.year -3.741571e-02
## S.T.year S.T.year -3.756011e-02
## A.npnct12.log A.npnct12.log -3.760012e-02
## H.P.what.we.are H.P.what.we.are -3.775209e-02
## A.T.will A.T.will -3.884318e-02
## S.T.will S.T.will -3.888838e-02
## A.T.appear A.T.appear -3.941362e-02
## S.T.appear S.T.appear -3.941362e-02
## PubDate.wkday.fctr PubDate.wkday.fctr -3.980129e-02
## H.T.pictur H.T.pictur -4.003882e-02
## H.T.new H.T.new -4.111696e-02
## A.T.senat A.T.senat -4.139980e-02
## S.T.senat S.T.senat -4.143422e-02
## S.T.show S.T.show -4.182920e-02
## A.T.show A.T.show -4.185292e-02
## H.P.today.in.smallbusiness H.P.today.in.smallbusiness -4.243051e-02
## S.T.day S.T.day -4.262213e-02
## A.T.day A.T.day -4.270831e-02
## H.P.first.draft H.P.first.draft -4.316253e-02
## S.npnct28.log S.npnct28.log -4.370037e-02
## A.npnct28.log A.npnct28.log -4.373349e-02
## A.P.daily.clip.report A.P.daily.clip.report -4.388279e-02
## H.P.daily.clip.report H.P.daily.clip.report -4.388279e-02
## H.T.clip H.T.clip -4.388279e-02
## S.P.daily.clip.report S.P.daily.clip.report -4.388279e-02
## A.T.first A.T.first -4.433630e-02
## H.T.news H.T.news -4.436368e-02
## S.T.first S.T.first -4.447317e-02
## H.T.first H.T.first -4.472902e-02
## H.T.X2014 H.T.X2014 -4.523858e-02
## A.T.newyork A.T.newyork -4.686921e-02
## S.T.newyork S.T.newyork -4.694998e-02
## A.T.report A.T.report -4.774593e-02
## A.T.compani A.T.compani -4.774812e-02
## S.T.report S.T.report -4.779877e-02
## S.T.compani S.T.compani -4.787994e-02
## A.T.word A.T.word -4.821561e-02
## S.T.word S.T.word -4.822452e-02
## H.T.morn H.T.morn -4.838380e-02
## H.T.busi H.T.busi -4.899819e-02
## A.T.newyorktim A.T.newyorktim -4.984782e-02
## S.T.newyorktim S.T.newyorktim -4.985328e-02
## A.npnct13.log A.npnct13.log -4.999563e-02
## A.T.share A.T.share -5.105597e-02
## S.T.share S.T.share -5.105597e-02
## H.npnct04.log H.npnct04.log -5.126277e-02
## S.npnct13.log S.npnct13.log -5.332519e-02
## A.T.articl A.T.articl -5.470831e-02
## S.T.articl S.T.articl -5.471737e-02
## H.T.newyork H.T.newyork -5.564999e-02
## H.T.today H.T.today -5.833786e-02
## H.T.springsumm H.T.springsumm -5.943248e-02
## H.T.day H.T.day -6.044381e-02
## H.npnct14.log H.npnct14.log -6.158577e-02
## A.T.diari A.T.diari -6.229931e-02
## S.T.diari S.T.diari -6.229931e-02
## H.T.report H.T.report -6.238114e-02
## A.npnct04.log A.npnct04.log -6.294642e-02
## S.npnct04.log S.npnct04.log -6.294642e-02
## H.T.daili H.T.daili -6.303731e-02
## H.T.X2015 H.T.X2015 -6.601141e-02
## A.T.herald A.T.herald -6.752419e-02
## S.T.herald S.T.herald -6.752419e-02
## S.npnct15.log S.npnct15.log -6.770952e-02
## H.T.week H.T.week -6.827601e-02
## A.T.photo A.T.photo -6.873838e-02
## S.T.photo S.T.photo -6.874283e-02
## A.npnct15.log A.npnct15.log -6.893301e-02
## A.T.intern A.T.intern -6.953025e-02
## S.T.intern S.T.intern -6.956906e-02
## A.T.tribun A.T.tribun -7.013418e-02
## S.T.tribun S.T.tribun -7.013418e-02
## A.P.fashion.week A.P.fashion.week -7.080716e-02
## S.P.fashion.week S.P.fashion.week -7.080716e-02
## A.T.archiv A.T.archiv -7.202808e-02
## S.T.archiv S.T.archiv -7.202808e-02
## H.P.fashion.week H.P.fashion.week -7.632046e-02
## H.P.year.colon H.P.year.colon -7.842875e-02
## H.T.fashion H.T.fashion -7.947505e-02
## H.npnct15.log H.npnct15.log -8.273237e-02
## A.T.fashion A.T.fashion -8.416793e-02
## S.T.fashion S.T.fashion -8.417159e-02
## A.T.week A.T.week -8.542792e-02
## S.T.week S.T.week -8.552704e-02
## H.nstopwrds.log H.nstopwrds.log -8.657067e-02
## H.npnct28.log H.npnct28.log -8.917338e-02
## S.npnct11.log S.npnct11.log -9.158156e-02
## A.npnct11.log A.npnct11.log -9.183870e-02
## S.nstopwrds.log S.nstopwrds.log -1.148150e-01
## A.nstopwrds.log A.nstopwrds.log -1.153879e-01
## H.ndgts.log H.ndgts.log -1.196633e-01
## S.ndgts.log S.ndgts.log -1.242046e-01
## A.ndgts.log A.ndgts.log -1.249484e-01
## H.nuppr.log H.nuppr.log -1.278085e-01
## H.nwrds.log H.nwrds.log -1.573431e-01
## H.nchrs.log H.nchrs.log -1.710624e-01
## S.nwrds.log S.nwrds.log -1.978341e-01
## A.nwrds.log A.nwrds.log -1.978712e-01
## H.nwrds.unq.log H.nwrds.unq.log -2.014127e-01
## A.nchrs.log A.nchrs.log -2.245488e-01
## S.nchrs.log S.nchrs.log -2.246930e-01
## A.nwrds.unq.log A.nwrds.unq.log -2.460117e-01
## S.nwrds.unq.log S.nwrds.unq.log -2.461670e-01
## S.nuppr.log S.nuppr.log -2.718459e-01
## A.nuppr.log A.nuppr.log -2.720962e-01
## A.npnct05.log A.npnct05.log NA
## A.npnct09.log A.npnct09.log NA
## A.npnct22.log A.npnct22.log NA
## A.npnct26.log A.npnct26.log NA
## A.npnct27.log A.npnct27.log NA
## A.npnct29.log A.npnct29.log NA
## A.npnct30.log A.npnct30.log NA
## H.npnct09.log H.npnct09.log NA
## H.npnct17.log H.npnct17.log NA
## H.npnct18.log H.npnct18.log NA
## H.npnct21.log H.npnct21.log NA
## H.npnct22.log H.npnct22.log NA
## H.npnct23.log H.npnct23.log NA
## H.npnct25.log H.npnct25.log NA
## H.npnct26.log H.npnct26.log NA
## H.npnct27.log H.npnct27.log NA
## H.npnct29.log H.npnct29.log NA
## H.npnct30.log H.npnct30.log NA
## H.P.http H.P.http NA
## PubDate.year.fctr PubDate.year.fctr NA
## S.npnct05.log S.npnct05.log NA
## S.npnct09.log S.npnct09.log NA
## S.npnct17.log S.npnct17.log NA
## S.npnct18.log S.npnct18.log NA
## S.npnct22.log S.npnct22.log NA
## S.npnct25.log S.npnct25.log NA
## S.npnct26.log S.npnct26.log NA
## S.npnct27.log S.npnct27.log NA
## S.npnct29.log S.npnct29.log NA
## S.npnct30.log S.npnct30.log NA
## S.P.http S.P.http NA
## exclude.as.feat cor.y.abs
## Popular 1 1.000000e+00
## WordCount.log 0 2.656836e-01
## A.ratio.sum.TfIdf.nwrds 0 2.623865e-01
## S.ratio.sum.TfIdf.nwrds 0 2.622549e-01
## WordCount 1 2.575265e-01
## H.ratio.sum.TfIdf.nwrds 0 2.254527e-01
## .clusterid 1 1.820567e-01
## .clusterid.fctr 0 1.813987e-01
## H.sum.TfIdf 0 1.520414e-01
## S.sum.TfIdf 0 1.484963e-01
## A.sum.TfIdf 0 1.478461e-01
## PubDate.hour.fctr 0 1.354368e-01
## H.npnct19.log 0 1.283641e-01
## A.ratio.nstopwrds.nwrds 0 1.213545e-01
## S.ratio.nstopwrds.nwrds 0 1.206896e-01
## PubDate.wkend 0 1.067288e-01
## H.P.recap.colon 0 9.008096e-02
## H.P.quandary 0 8.734922e-02
## H.P.no.comment.colon 0 6.074669e-02
## S.npnct19.log 0 5.503894e-02
## A.npnct19.log 0 5.482747e-02
## H.P.facts.figures 0 5.410097e-02
## PubDate.last10 1 5.398093e-02
## H.npnct08.log 0 5.375262e-02
## PubDate.last10.log 0 4.931702e-02
## PubDate.last1.log 0 4.635751e-02
## H.P.readers.respond 0 4.432886e-02
## A.T.make 0 4.124187e-02
## S.T.make 0 4.118050e-02
## H.ratio.nstopwrds.nwrds 0 4.024406e-02
## PubDate.last100 1 3.989229e-02
## PubDate.last1 1 3.592267e-02
## H.T.get 0 3.300192e-02
## H.npnct06.log 0 3.190718e-02
## A.npnct01.log 0 3.093101e-02
## S.npnct01.log 0 3.093101e-02
## A.T.can 0 3.083389e-02
## H.npnct16.log 0 3.039622e-02
## S.T.can 0 3.005998e-02
## S.npnct21.log 0 2.760321e-02
## S.npnct23.log 0 2.760321e-02
## H.T.ebola 0 2.682920e-02
## H.npnct01.log 0 2.271577e-02
## PubDate.month.fctr 1 1.914874e-02
## A.T.said 0 1.876762e-02
## S.T.said 0 1.863436e-02
## PubDate.POSIX 1 1.568326e-02
## PubDate.zoo 1 1.568326e-02
## A.npnct21.log 0 1.537569e-02
## A.npnct23.log 0 1.537569e-02
## H.T.make 0 1.349595e-02
## H.npnct11.log 0 1.333613e-02
## myCategory.fctr 0 1.234541e-02
## UniqueID 1 1.182492e-02
## A.T.one 0 1.051414e-02
## S.T.one 0 1.050293e-02
## H.npnct03.log 0 9.533020e-03
## H.P.s.notebook 0 7.755542e-03
## A.npnct24.log 0 9.890046e-19
## H.npnct24.log 0 9.890046e-19
## S.npnct24.log 0 9.890046e-19
## H.T.take 0 1.263270e-03
## A.npnct16.log 0 1.587454e-03
## S.npnct16.log 0 1.587454e-03
## A.T.presid 0 2.090565e-03
## S.T.presid 0 2.381159e-03
## S.npnct08.log 0 3.372706e-03
## A.npnct08.log 0 4.193476e-03
## A.npnct25.log 0 5.547032e-03
## A.npnct10.log 0 5.547032e-03
## H.npnct10.log 0 5.547032e-03
## H.npnct20.log 0 5.547032e-03
## S.npnct02.log 0 5.547032e-03
## S.npnct10.log 0 5.547032e-03
## PubDate.last100.log 0 7.663322e-03
## .rnorm 0 8.244230e-03
## H.npnct05.log 0 9.653967e-03
## H.P.friday.night.music 0 9.653967e-03
## H.T.say 0 9.763205e-03
## H.T.obama 0 9.878461e-03
## H.T.bank 0 1.037439e-02
## PubDate.date.fctr 0 1.164756e-02
## PubDate.second.fctr 0 1.187946e-02
## H.npnct07.log 0 1.201741e-02
## A.npnct07.log 0 1.214357e-02
## S.npnct07.log 0 1.214357e-02
## S.npnct03.log 0 1.240734e-02
## A.npnct18.log 0 1.271661e-02
## A.P.http 0 1.294748e-02
## H.npnct12.log 0 1.305305e-02
## A.npnct03.log 0 1.359260e-02
## H.T.word 0 1.382927e-02
## H.T.big 0 1.438162e-02
## A.npnct02.log 0 1.451467e-02
## A.npnct17.log 0 1.457558e-02
## A.P.year.colon 0 1.755336e-02
## S.P.year.colon 0 1.755336e-02
## S.T.obama 0 1.914281e-02
## A.T.obama 0 1.914924e-02
## A.npnct20.log 0 1.923169e-02
## S.npnct20.log 0 1.923169e-02
## H.npnct02.log 0 2.001851e-02
## H.T.test 0 2.117852e-02
## S.npnct14.log 0 2.121844e-02
## A.P.first.draft 0 2.150663e-02
## H.P.on.this.day 0 2.150663e-02
## S.P.first.draft 0 2.150663e-02
## S.T.take 0 2.264447e-02
## A.T.take 0 2.271897e-02
## A.npnct06.log 0 2.389145e-02
## S.npnct06.log 0 2.389145e-02
## A.npnct14.log 0 2.407715e-02
## S.T.time 0 2.416246e-02
## A.T.time 0 2.430509e-02
## H.T.newyorktim 0 2.514415e-02
## H.npnct13.log 0 2.524770e-02
## H.T.deal 0 2.556237e-02
## S.T.new 0 2.592872e-02
## A.T.new 0 2.597887e-02
## H.T.billion 0 2.776561e-02
## A.P.metropolitan.diary.colon 0 2.841404e-02
## S.P.metropolitan.diary.colon 0 2.841404e-02
## H.T.polit 0 3.058564e-02
## H.P.verbatim.colon 0 3.194363e-02
## H.T.china 0 3.283653e-02
## H.T.art 0 3.291486e-02
## PubDate.minute.fctr 0 3.407385e-02
## H.T.read 0 3.467043e-02
## S.npnct12.log 0 3.638891e-02
## H.P.today.in.politic 0 3.733661e-02
## A.T.year 0 3.741571e-02
## S.T.year 0 3.756011e-02
## A.npnct12.log 0 3.760012e-02
## H.P.what.we.are 0 3.775209e-02
## A.T.will 0 3.884318e-02
## S.T.will 0 3.888838e-02
## A.T.appear 0 3.941362e-02
## S.T.appear 0 3.941362e-02
## PubDate.wkday.fctr 0 3.980129e-02
## H.T.pictur 0 4.003882e-02
## H.T.new 0 4.111696e-02
## A.T.senat 0 4.139980e-02
## S.T.senat 0 4.143422e-02
## S.T.show 0 4.182920e-02
## A.T.show 0 4.185292e-02
## H.P.today.in.smallbusiness 0 4.243051e-02
## S.T.day 0 4.262213e-02
## A.T.day 0 4.270831e-02
## H.P.first.draft 0 4.316253e-02
## S.npnct28.log 0 4.370037e-02
## A.npnct28.log 0 4.373349e-02
## A.P.daily.clip.report 0 4.388279e-02
## H.P.daily.clip.report 0 4.388279e-02
## H.T.clip 0 4.388279e-02
## S.P.daily.clip.report 0 4.388279e-02
## A.T.first 0 4.433630e-02
## H.T.news 0 4.436368e-02
## S.T.first 0 4.447317e-02
## H.T.first 0 4.472902e-02
## H.T.X2014 0 4.523858e-02
## A.T.newyork 0 4.686921e-02
## S.T.newyork 0 4.694998e-02
## A.T.report 0 4.774593e-02
## A.T.compani 0 4.774812e-02
## S.T.report 0 4.779877e-02
## S.T.compani 0 4.787994e-02
## A.T.word 0 4.821561e-02
## S.T.word 0 4.822452e-02
## H.T.morn 0 4.838380e-02
## H.T.busi 0 4.899819e-02
## A.T.newyorktim 0 4.984782e-02
## S.T.newyorktim 0 4.985328e-02
## A.npnct13.log 0 4.999563e-02
## A.T.share 0 5.105597e-02
## S.T.share 0 5.105597e-02
## H.npnct04.log 0 5.126277e-02
## S.npnct13.log 0 5.332519e-02
## A.T.articl 0 5.470831e-02
## S.T.articl 0 5.471737e-02
## H.T.newyork 0 5.564999e-02
## H.T.today 0 5.833786e-02
## H.T.springsumm 0 5.943248e-02
## H.T.day 0 6.044381e-02
## H.npnct14.log 0 6.158577e-02
## A.T.diari 0 6.229931e-02
## S.T.diari 0 6.229931e-02
## H.T.report 0 6.238114e-02
## A.npnct04.log 0 6.294642e-02
## S.npnct04.log 0 6.294642e-02
## H.T.daili 0 6.303731e-02
## H.T.X2015 0 6.601141e-02
## A.T.herald 0 6.752419e-02
## S.T.herald 0 6.752419e-02
## S.npnct15.log 0 6.770952e-02
## H.T.week 0 6.827601e-02
## A.T.photo 0 6.873838e-02
## S.T.photo 0 6.874283e-02
## A.npnct15.log 0 6.893301e-02
## A.T.intern 0 6.953025e-02
## S.T.intern 0 6.956906e-02
## A.T.tribun 0 7.013418e-02
## S.T.tribun 0 7.013418e-02
## A.P.fashion.week 0 7.080716e-02
## S.P.fashion.week 0 7.080716e-02
## A.T.archiv 0 7.202808e-02
## S.T.archiv 0 7.202808e-02
## H.P.fashion.week 0 7.632046e-02
## H.P.year.colon 0 7.842875e-02
## H.T.fashion 0 7.947505e-02
## H.npnct15.log 0 8.273237e-02
## A.T.fashion 0 8.416793e-02
## S.T.fashion 0 8.417159e-02
## A.T.week 0 8.542792e-02
## S.T.week 0 8.552704e-02
## H.nstopwrds.log 0 8.657067e-02
## H.npnct28.log 0 8.917338e-02
## S.npnct11.log 0 9.158156e-02
## A.npnct11.log 0 9.183870e-02
## S.nstopwrds.log 0 1.148150e-01
## A.nstopwrds.log 0 1.153879e-01
## H.ndgts.log 0 1.196633e-01
## S.ndgts.log 0 1.242046e-01
## A.ndgts.log 0 1.249484e-01
## H.nuppr.log 0 1.278085e-01
## H.nwrds.log 0 1.573431e-01
## H.nchrs.log 0 1.710624e-01
## S.nwrds.log 0 1.978341e-01
## A.nwrds.log 0 1.978712e-01
## H.nwrds.unq.log 0 2.014127e-01
## A.nchrs.log 0 2.245488e-01
## S.nchrs.log 0 2.246930e-01
## A.nwrds.unq.log 0 2.460117e-01
## S.nwrds.unq.log 0 2.461670e-01
## S.nuppr.log 0 2.718459e-01
## A.nuppr.log 0 2.720962e-01
## A.npnct05.log 0 NA
## A.npnct09.log 0 NA
## A.npnct22.log 0 NA
## A.npnct26.log 0 NA
## A.npnct27.log 0 NA
## A.npnct29.log 0 NA
## A.npnct30.log 0 NA
## H.npnct09.log 0 NA
## H.npnct17.log 0 NA
## H.npnct18.log 0 NA
## H.npnct21.log 0 NA
## H.npnct22.log 0 NA
## H.npnct23.log 0 NA
## H.npnct25.log 0 NA
## H.npnct26.log 0 NA
## H.npnct27.log 0 NA
## H.npnct29.log 0 NA
## H.npnct30.log 0 NA
## H.P.http 0 NA
## PubDate.year.fctr 0 NA
## S.npnct05.log 0 NA
## S.npnct09.log 0 NA
## S.npnct17.log 0 NA
## S.npnct18.log 0 NA
## S.npnct22.log 0 NA
## S.npnct25.log 0 NA
## S.npnct26.log 0 NA
## S.npnct27.log 0 NA
## S.npnct29.log 0 NA
## S.npnct30.log 0 NA
## S.P.http 0 NA
## cor.high.X freqRatio
## Popular <NA> 4.976212
## WordCount.log <NA> 1.300000
## A.ratio.sum.TfIdf.nwrds A.nstopwrds.log 2.583333
## S.ratio.sum.TfIdf.nwrds <NA> 2.583333
## WordCount <NA> 2.315789
## H.ratio.sum.TfIdf.nwrds <NA> 1.148148
## .clusterid <NA> 16.410959
## .clusterid.fctr <NA> 16.410959
## H.sum.TfIdf <NA> 1.127273
## S.sum.TfIdf A.sum.TfIdf 2.583333
## A.sum.TfIdf <NA> 2.583333
## PubDate.hour.fctr <NA> 1.835040
## H.npnct19.log <NA> 14.995098
## A.ratio.nstopwrds.nwrds S.ratio.nstopwrds.nwrds 1.915094
## S.ratio.nstopwrds.nwrds <NA> 1.908517
## PubDate.wkend <NA> 9.095827
## H.P.recap.colon <NA> 93.666667
## H.P.quandary <NA> 652.200000
## H.P.no.comment.colon <NA> 724.777778
## S.npnct19.log A.npnct19.log 12.862366
## A.npnct19.log <NA> 12.798715
## H.P.facts.figures <NA> 1087.666667
## PubDate.last10 <NA> 1.666667
## H.npnct08.log <NA> 111.620690
## PubDate.last10.log <NA> 1.666667
## PubDate.last1.log <NA> 1.142857
## H.P.readers.respond <NA> 342.789474
## A.T.make S.T.make 273.782609
## S.T.make <NA> 273.782609
## H.ratio.nstopwrds.nwrds <NA> 1.141631
## PubDate.last100 <NA> 25.000000
## PubDate.last1 <NA> 1.142857
## H.T.get <NA> 430.866667
## H.npnct06.log H.npnct16.log 68.935484
## A.npnct01.log S.npnct01.log 309.952381
## S.npnct01.log <NA> 309.952381
## A.T.can S.T.can 261.666667
## H.npnct16.log <NA> 96.104478
## S.T.can <NA> 261.666667
## S.npnct21.log A.npnct21.log 6531.000000
## S.npnct23.log <NA> 6531.000000
## H.T.ebola <NA> 293.000000
## H.npnct01.log <NA> 282.913043
## PubDate.month.fctr <NA> 1.017514
## A.T.said S.T.said 202.516129
## S.T.said <NA> 202.516129
## PubDate.POSIX <NA> 1.000000
## PubDate.zoo <NA> 1.000000
## A.npnct21.log A.npnct23.log 3264.500000
## A.npnct23.log <NA> 3264.500000
## H.T.make <NA> 322.200000
## H.npnct11.log <NA> 4.937442
## myCategory.fctr <NA> 1.337185
## UniqueID <NA> 1.000000
## A.T.one S.T.one 214.931034
## S.T.one <NA> 214.965517
## H.npnct03.log <NA> 2176.333333
## H.P.s.notebook <NA> 815.500000
## A.npnct24.log <NA> 0.000000
## H.npnct24.log <NA> 0.000000
## S.npnct24.log <NA> 0.000000
## H.T.take <NA> 322.250000
## A.npnct16.log <NA> 434.133333
## S.npnct16.log <NA> 434.133333
## A.T.presid <NA> 232.740741
## S.T.presid <NA> 232.740741
## S.npnct08.log <NA> 175.486486
## A.npnct08.log <NA> 170.842105
## A.npnct25.log <NA> 6531.000000
## A.npnct10.log <NA> 6531.000000
## H.npnct10.log <NA> 6531.000000
## H.npnct20.log <NA> 6531.000000
## S.npnct02.log <NA> 6531.000000
## S.npnct10.log <NA> 6531.000000
## PubDate.last100.log <NA> 25.000000
## .rnorm <NA> 2.000000
## H.npnct05.log <NA> 543.333333
## H.P.friday.night.music <NA> 543.333333
## H.T.say <NA> 247.461538
## H.T.obama <NA> 229.750000
## H.T.bank <NA> 214.300000
## PubDate.date.fctr <NA> 1.021394
## PubDate.second.fctr <NA> 1.018204
## H.npnct07.log <NA> 5.437234
## A.npnct07.log S.npnct07.log 1631.750000
## S.npnct07.log <NA> 1631.750000
## S.npnct03.log <NA> 1305.400000
## A.npnct18.log <NA> 1631.500000
## A.P.http A.npnct18.log 1305.200000
## H.npnct12.log <NA> 13.126638
## A.npnct03.log S.npnct03.log 1087.666667
## H.T.word <NA> 104.096774
## H.T.big <NA> 403.562500
## A.npnct02.log A.P.http 1087.500000
## A.npnct17.log A.npnct02.log 1087.500000
## A.P.year.colon S.P.year.colon 652.200000
## S.P.year.colon <NA> 652.200000
## S.T.obama <NA> 398.625000
## A.T.obama S.T.obama 398.625000
## A.npnct20.log S.npnct20.log 543.333333
## S.npnct20.log <NA> 543.333333
## H.npnct02.log <NA> 501.461538
## H.T.test <NA> 306.666667
## S.npnct14.log <NA> 203.062500
## A.P.first.draft S.P.first.draft 434.466667
## H.P.on.this.day <NA> 434.466667
## S.P.first.draft <NA> 434.466667
## S.T.take <NA> 274.608696
## A.T.take S.T.take 274.565217
## A.npnct06.log S.npnct06.log 115.642857
## S.npnct06.log <NA> 115.642857
## A.npnct14.log A.npnct17.log 196.696970
## S.T.time <NA> 217.862069
## A.T.time S.T.time 217.827586
## H.T.newyorktim <NA> 433.266667
## H.npnct13.log <NA> 22.802326
## H.T.deal <NA> 230.428571
## S.T.new <NA> 114.423077
## A.T.new S.T.new 114.403846
## H.T.billion <NA> 214.533333
## A.P.metropolitan.diary.colon S.P.metropolitan.diary.colon 99.492308
## S.P.metropolitan.diary.colon <NA> 99.492308
## H.T.polit <NA> 128.780000
## H.P.verbatim.colon <NA> 196.939394
## H.T.china <NA> 238.407407
## H.T.art <NA> 293.363636
## PubDate.minute.fctr <NA> 1.483365
## H.T.read <NA> 179.388889
## S.npnct12.log <NA> 5.706263
## H.P.today.in.politic H.T.polit 144.155556
## A.T.year <NA> 160.815789
## S.T.year A.T.year 160.815789
## A.npnct12.log S.npnct12.log 5.715368
## H.P.what.we.are H.T.read 141.000000
## A.T.will <NA> 121.734694
## S.T.will A.T.will 119.340000
## A.T.appear H.T.word 228.821429
## S.T.appear <NA> 228.821429
## PubDate.wkday.fctr <NA> 1.003268
## H.T.pictur <NA> 99.230769
## H.T.new <NA> 123.333333
## A.T.senat <NA> 372.294118
## S.T.senat A.T.senat 372.352941
## S.T.show <NA> 274.608696
## A.T.show S.T.show 263.166667
## H.P.today.in.smallbusiness <NA> 111.620690
## S.T.day <NA> 89.528571
## A.T.day S.T.day 89.514286
## H.P.first.draft <NA> 107.866667
## S.npnct28.log <NA> 134.791667
## A.npnct28.log S.npnct28.log 126.862745
## A.P.daily.clip.report H.T.clip 104.354839
## H.P.daily.clip.report <NA> 104.354839
## H.T.clip <NA> 104.354839
## S.P.daily.clip.report <NA> 104.354839
## A.T.first <NA> 225.250000
## H.T.news <NA> 322.000000
## S.T.first A.T.first 225.250000
## H.T.first H.P.first.draft 194.727273
## H.T.X2014 <NA> 110.879310
## A.T.newyork <NA> 149.547619
## S.T.newyork A.T.newyork 149.547619
## A.T.report <NA> 80.371795
## A.T.compani <NA> 137.111111
## S.T.report A.T.report 80.371795
## S.T.compani A.T.compani 137.111111
## A.T.word <NA> 133.125000
## S.T.word A.T.word 133.125000
## H.T.morn A.npnct28.log 165.205128
## H.T.busi <NA> 229.428571
## A.T.newyorktim <NA> 84.540541
## S.T.newyorktim A.T.newyorktim 84.540541
## A.npnct13.log <NA> 4.603330
## A.T.share S.T.share 234.629630
## S.T.share <NA> 234.629630
## H.npnct04.log H.T.billion 38.325301
## S.npnct13.log A.npnct13.log 4.672000
## A.T.articl <NA> 85.500000
## S.T.articl A.T.articl 85.500000
## H.T.newyork <NA> 112.517857
## H.T.today H.P.today.in.politic 138.239130
## H.T.springsumm <NA> 106.966667
## H.T.day <NA> 86.547945
## H.npnct14.log H.T.springsumm 52.983471
## A.T.diari S.T.diari 71.528090
## S.T.diari <NA> 71.528090
## H.T.report <NA> 102.000000
## A.npnct04.log S.npnct04.log 28.536364
## S.npnct04.log <NA> 28.536364
## H.T.daili H.T.report 102.903226
## H.T.X2015 A.T.diari 96.833333
## A.T.herald S.T.herald 144.750000
## S.T.herald <NA> 144.750000
## S.npnct15.log <NA> 13.647191
## H.T.week <NA> 71.352273
## A.T.photo <NA> 70.400000
## S.T.photo A.T.photo 70.400000
## A.npnct15.log S.npnct15.log 13.482222
## A.T.intern <NA> 140.400000
## S.T.intern A.T.intern 140.400000
## A.T.tribun A.T.herald 144.750000
## S.T.tribun <NA> 144.750000
## A.P.fashion.week S.P.fashion.week 40.081761
## S.P.fashion.week <NA> 40.081761
## A.T.archiv S.T.intern 144.545455
## S.T.archiv <NA> 144.545455
## H.P.fashion.week <NA> 34.500000
## H.P.year.colon A.T.archiv 32.670103
## H.T.fashion H.P.fashion.week 76.926829
## H.npnct15.log <NA> 3.914910
## A.T.fashion <NA> 59.245283
## S.T.fashion H.T.X2015 59.245283
## A.T.week <NA> 56.560748
## S.T.week A.T.week 56.560748
## H.nstopwrds.log <NA> 1.370729
## H.npnct28.log <NA> 24.123077
## S.npnct11.log <NA> 1.660473
## A.npnct11.log S.npnct11.log 1.660473
## S.nstopwrds.log <NA> 1.097879
## A.nstopwrds.log S.nstopwrds.log 1.096091
## H.ndgts.log <NA> 13.616137
## S.ndgts.log <NA> 10.511247
## A.ndgts.log S.ndgts.log 10.501022
## H.nuppr.log <NA> 1.033930
## H.nwrds.log <NA> 1.104308
## H.nchrs.log H.nwrds.log 1.023810
## S.nwrds.log <NA> 1.049342
## A.nwrds.log S.nwrds.log 1.052805
## H.nwrds.unq.log H.nuppr.log 1.000000
## A.nchrs.log <NA> 1.328571
## S.nchrs.log A.nwrds.log 1.328571
## A.nwrds.unq.log <NA> 1.054206
## S.nwrds.unq.log S.nchrs.log 1.054206
## S.nuppr.log <NA> 1.152620
## A.nuppr.log S.nuppr.log 1.151308
## A.npnct05.log <NA> 0.000000
## A.npnct09.log <NA> 0.000000
## A.npnct22.log <NA> 0.000000
## A.npnct26.log <NA> 0.000000
## A.npnct27.log <NA> 0.000000
## A.npnct29.log <NA> 0.000000
## A.npnct30.log <NA> 0.000000
## H.npnct09.log <NA> 0.000000
## H.npnct17.log <NA> 0.000000
## H.npnct18.log <NA> 0.000000
## H.npnct21.log <NA> 0.000000
## H.npnct22.log <NA> 0.000000
## H.npnct23.log <NA> 0.000000
## H.npnct25.log <NA> 0.000000
## H.npnct26.log <NA> 0.000000
## H.npnct27.log <NA> 0.000000
## H.npnct29.log <NA> 0.000000
## H.npnct30.log <NA> 0.000000
## H.P.http <NA> 0.000000
## PubDate.year.fctr <NA> 0.000000
## S.npnct05.log <NA> 0.000000
## S.npnct09.log <NA> 0.000000
## S.npnct17.log <NA> 0.000000
## S.npnct18.log <NA> 0.000000
## S.npnct22.log <NA> 0.000000
## S.npnct25.log <NA> 0.000000
## S.npnct26.log <NA> 0.000000
## S.npnct27.log <NA> 0.000000
## S.npnct29.log <NA> 0.000000
## S.npnct30.log <NA> 0.000000
## S.P.http <NA> 0.000000
## percentUnique zeroVar nzv myNearZV
## Popular 0.03061849 FALSE FALSE FALSE
## WordCount.log 24.14268218 FALSE FALSE FALSE
## A.ratio.sum.TfIdf.nwrds 94.51928965 FALSE FALSE FALSE
## S.ratio.sum.TfIdf.nwrds 94.45805266 FALSE FALSE FALSE
## WordCount 24.15799143 FALSE FALSE FALSE
## H.ratio.sum.TfIdf.nwrds 90.46233925 FALSE FALSE FALSE
## .clusterid 1.17881200 FALSE FALSE FALSE
## .clusterid.fctr 1.17881200 FALSE FALSE FALSE
## H.sum.TfIdf 84.44580527 FALSE FALSE FALSE
## S.sum.TfIdf 94.32026944 FALSE FALSE FALSE
## A.sum.TfIdf 94.27434170 FALSE FALSE FALSE
## PubDate.hour.fctr 0.04592774 FALSE FALSE FALSE
## H.npnct19.log 0.06123699 FALSE FALSE FALSE
## A.ratio.nstopwrds.nwrds 4.10287814 FALSE FALSE FALSE
## S.ratio.nstopwrds.nwrds 3.75076546 FALSE FALSE FALSE
## PubDate.wkend 0.03061849 FALSE FALSE FALSE
## H.P.recap.colon 0.03061849 FALSE TRUE FALSE
## H.P.quandary 0.03061849 FALSE TRUE FALSE
## H.P.no.comment.colon 0.03061849 FALSE TRUE FALSE
## S.npnct19.log 0.07654623 FALSE FALSE FALSE
## A.npnct19.log 0.07654623 FALSE FALSE FALSE
## H.P.facts.figures 0.03061849 FALSE TRUE FALSE
## PubDate.last10 79.05695040 FALSE FALSE FALSE
## H.npnct08.log 0.03061849 FALSE TRUE FALSE
## PubDate.last10.log 79.05695040 FALSE FALSE FALSE
## PubDate.last1.log 36.49724434 FALSE FALSE FALSE
## H.P.readers.respond 0.03061849 FALSE TRUE FALSE
## A.T.make 0.44396816 FALSE TRUE FALSE
## S.T.make 0.44396816 FALSE TRUE FALSE
## H.ratio.nstopwrds.nwrds 0.96448255 FALSE FALSE FALSE
## PubDate.last100 92.52908757 FALSE FALSE FALSE
## PubDate.last1 36.49724434 FALSE FALSE FALSE
## H.T.get 0.18371096 FALSE TRUE FALSE
## H.npnct06.log 0.06123699 FALSE TRUE FALSE
## A.npnct01.log 0.06123699 FALSE TRUE FALSE
## S.npnct01.log 0.06123699 FALSE TRUE FALSE
## A.T.can 0.48989590 FALSE TRUE FALSE
## H.npnct16.log 0.06123699 FALSE TRUE FALSE
## S.T.can 0.41334966 FALSE TRUE FALSE
## S.npnct21.log 0.03061849 FALSE TRUE TRUE
## S.npnct23.log 0.03061849 FALSE TRUE TRUE
## H.T.ebola 0.16840171 FALSE TRUE FALSE
## H.npnct01.log 0.04592774 FALSE TRUE FALSE
## PubDate.month.fctr 0.04592774 FALSE FALSE FALSE
## A.T.said 0.41334966 FALSE TRUE FALSE
## S.T.said 0.38273117 FALSE TRUE FALSE
## PubDate.POSIX 99.86221678 FALSE FALSE FALSE
## PubDate.zoo 99.86221678 FALSE FALSE FALSE
## A.npnct21.log 0.04592774 FALSE TRUE TRUE
## A.npnct23.log 0.04592774 FALSE TRUE TRUE
## H.T.make 0.13778322 FALSE TRUE FALSE
## H.npnct11.log 0.07654623 FALSE FALSE FALSE
## myCategory.fctr 0.30618494 FALSE FALSE FALSE
## UniqueID 100.00000000 FALSE FALSE FALSE
## A.T.one 0.48989590 FALSE TRUE FALSE
## S.T.one 0.44396816 FALSE TRUE FALSE
## H.npnct03.log 0.03061849 FALSE TRUE TRUE
## H.P.s.notebook 0.03061849 FALSE TRUE FALSE
## A.npnct24.log 0.01530925 TRUE TRUE TRUE
## H.npnct24.log 0.01530925 TRUE TRUE TRUE
## S.npnct24.log 0.01530925 TRUE TRUE TRUE
## H.T.take 0.15309247 FALSE TRUE FALSE
## A.npnct16.log 0.04592774 FALSE TRUE FALSE
## S.npnct16.log 0.04592774 FALSE TRUE FALSE
## A.T.presid 0.45927740 FALSE TRUE FALSE
## S.T.presid 0.42865891 FALSE TRUE FALSE
## S.npnct08.log 0.06123699 FALSE TRUE FALSE
## A.npnct08.log 0.06123699 FALSE TRUE FALSE
## A.npnct25.log 0.03061849 FALSE TRUE TRUE
## A.npnct10.log 0.03061849 FALSE TRUE TRUE
## H.npnct10.log 0.03061849 FALSE TRUE TRUE
## H.npnct20.log 0.03061849 FALSE TRUE TRUE
## S.npnct02.log 0.03061849 FALSE TRUE TRUE
## S.npnct10.log 0.03061849 FALSE TRUE TRUE
## PubDate.last100.log 92.19228414 FALSE FALSE FALSE
## .rnorm 99.98469075 FALSE FALSE FALSE
## H.npnct05.log 0.03061849 FALSE TRUE FALSE
## H.P.friday.night.music 0.03061849 FALSE TRUE FALSE
## H.T.say 0.16840171 FALSE TRUE FALSE
## H.T.obama 0.16840171 FALSE TRUE FALSE
## H.T.bank 0.13778322 FALSE TRUE FALSE
## PubDate.date.fctr 0.07654623 FALSE FALSE FALSE
## PubDate.second.fctr 0.06123699 FALSE FALSE FALSE
## H.npnct07.log 0.12247397 FALSE FALSE FALSE
## A.npnct07.log 0.04592774 FALSE TRUE FALSE
## S.npnct07.log 0.04592774 FALSE TRUE FALSE
## S.npnct03.log 0.03061849 FALSE TRUE FALSE
## A.npnct18.log 0.06123699 FALSE TRUE FALSE
## A.P.http 0.04592774 FALSE TRUE FALSE
## H.npnct12.log 0.09185548 FALSE FALSE FALSE
## A.npnct03.log 0.03061849 FALSE TRUE FALSE
## H.T.word 0.13778322 FALSE TRUE FALSE
## H.T.big 0.19902021 FALSE TRUE FALSE
## A.npnct02.log 0.04592774 FALSE TRUE FALSE
## A.npnct17.log 0.04592774 FALSE TRUE FALSE
## A.P.year.colon 0.03061849 FALSE TRUE FALSE
## S.P.year.colon 0.03061849 FALSE TRUE FALSE
## S.T.obama 0.38273117 FALSE TRUE FALSE
## A.T.obama 0.42865891 FALSE TRUE FALSE
## A.npnct20.log 0.03061849 FALSE TRUE FALSE
## S.npnct20.log 0.03061849 FALSE TRUE FALSE
## H.npnct02.log 0.03061849 FALSE TRUE FALSE
## H.T.test 0.13778322 FALSE TRUE FALSE
## S.npnct14.log 0.04592774 FALSE TRUE FALSE
## A.P.first.draft 0.03061849 FALSE TRUE FALSE
## H.P.on.this.day 0.03061849 FALSE TRUE FALSE
## S.P.first.draft 0.03061849 FALSE TRUE FALSE
## S.T.take 0.38273117 FALSE TRUE FALSE
## A.T.take 0.42865891 FALSE TRUE FALSE
## A.npnct06.log 0.03061849 FALSE TRUE FALSE
## S.npnct06.log 0.03061849 FALSE TRUE FALSE
## A.npnct14.log 0.10716473 FALSE TRUE FALSE
## S.T.time 0.42865891 FALSE TRUE FALSE
## A.T.time 0.42865891 FALSE TRUE FALSE
## H.T.newyorktim 0.12247397 FALSE TRUE FALSE
## H.npnct13.log 0.12247397 FALSE TRUE FALSE
## H.T.deal 0.13778322 FALSE TRUE FALSE
## S.T.new 0.47458665 FALSE TRUE FALSE
## A.T.new 0.48989590 FALSE TRUE FALSE
## H.T.billion 0.13778322 FALSE TRUE FALSE
## A.P.metropolitan.diary.colon 0.03061849 FALSE TRUE FALSE
## S.P.metropolitan.diary.colon 0.03061849 FALSE TRUE FALSE
## H.T.polit 0.13778322 FALSE TRUE FALSE
## H.P.verbatim.colon 0.03061849 FALSE TRUE FALSE
## H.T.china 0.16840171 FALSE TRUE FALSE
## H.T.art 0.19902021 FALSE TRUE FALSE
## PubDate.minute.fctr 0.06123699 FALSE FALSE FALSE
## H.T.read 0.16840171 FALSE TRUE FALSE
## S.npnct12.log 0.09185548 FALSE FALSE FALSE
## H.P.today.in.politic 0.03061849 FALSE TRUE FALSE
## A.T.year 0.48989590 FALSE TRUE FALSE
## S.T.year 0.45927740 FALSE TRUE FALSE
## A.npnct12.log 0.12247397 FALSE FALSE FALSE
## H.P.what.we.are 0.03061849 FALSE TRUE FALSE
## A.T.will 0.59706062 FALSE TRUE FALSE
## S.T.will 0.55113288 FALSE TRUE FALSE
## A.T.appear 0.30618494 FALSE TRUE FALSE
## S.T.appear 0.30618494 FALSE TRUE FALSE
## PubDate.wkday.fctr 0.10716473 FALSE FALSE FALSE
## H.T.pictur 0.10716473 FALSE TRUE FALSE
## H.T.new 0.19902021 FALSE TRUE FALSE
## A.T.senat 0.50520514 FALSE TRUE FALSE
## S.T.senat 0.47458665 FALSE TRUE FALSE
## S.T.show 0.38273117 FALSE TRUE FALSE
## A.T.show 0.39804042 FALSE TRUE FALSE
## H.P.today.in.smallbusiness 0.03061849 FALSE TRUE FALSE
## S.T.day 0.41334966 FALSE TRUE FALSE
## A.T.day 0.44396816 FALSE TRUE FALSE
## H.P.first.draft 0.03061849 FALSE TRUE FALSE
## S.npnct28.log 0.04592774 FALSE TRUE FALSE
## A.npnct28.log 0.04592774 FALSE TRUE FALSE
## A.P.daily.clip.report 0.03061849 FALSE TRUE FALSE
## H.P.daily.clip.report 0.03061849 FALSE TRUE FALSE
## H.T.clip 0.03061849 FALSE TRUE FALSE
## S.P.daily.clip.report 0.03061849 FALSE TRUE FALSE
## A.T.first 0.42865891 FALSE TRUE FALSE
## H.T.news 0.15309247 FALSE TRUE FALSE
## S.T.first 0.41334966 FALSE TRUE FALSE
## H.T.first 0.15309247 FALSE TRUE FALSE
## H.T.X2014 0.13778322 FALSE TRUE FALSE
## A.T.newyork 0.44396816 FALSE TRUE FALSE
## S.T.newyork 0.41334966 FALSE TRUE FALSE
## A.T.report 0.38273117 FALSE TRUE FALSE
## A.T.compani 0.48989590 FALSE TRUE FALSE
## S.T.report 0.35211268 FALSE TRUE FALSE
## S.T.compani 0.44396816 FALSE TRUE FALSE
## A.T.word 0.30618494 FALSE TRUE FALSE
## S.T.word 0.30618494 FALSE TRUE FALSE
## H.T.morn 0.07654623 FALSE TRUE FALSE
## H.T.busi 0.18371096 FALSE TRUE FALSE
## A.T.newyorktim 0.32149418 FALSE TRUE FALSE
## S.T.newyorktim 0.33680343 FALSE TRUE FALSE
## A.npnct13.log 0.16840171 FALSE FALSE FALSE
## A.T.share 0.38273117 FALSE TRUE FALSE
## S.T.share 0.38273117 FALSE TRUE FALSE
## H.npnct04.log 0.04592774 FALSE TRUE FALSE
## S.npnct13.log 0.16840171 FALSE FALSE FALSE
## A.T.articl 0.29087569 FALSE TRUE FALSE
## S.T.articl 0.29087569 FALSE TRUE FALSE
## H.T.newyork 0.15309247 FALSE TRUE FALSE
## H.T.today 0.13778322 FALSE TRUE FALSE
## H.T.springsumm 0.09185548 FALSE TRUE FALSE
## H.T.day 0.18371096 FALSE TRUE FALSE
## H.npnct14.log 0.03061849 FALSE TRUE FALSE
## A.T.diari 0.18371096 FALSE TRUE FALSE
## S.T.diari 0.18371096 FALSE TRUE FALSE
## H.T.report 0.16840171 FALSE TRUE FALSE
## A.npnct04.log 0.07654623 FALSE TRUE FALSE
## S.npnct04.log 0.07654623 FALSE TRUE FALSE
## H.T.daili 0.16840171 FALSE TRUE FALSE
## H.T.X2015 0.10716473 FALSE TRUE FALSE
## A.T.herald 0.24494795 FALSE TRUE FALSE
## S.T.herald 0.24494795 FALSE TRUE FALSE
## S.npnct15.log 0.04592774 FALSE FALSE FALSE
## H.T.week 0.16840171 FALSE TRUE FALSE
## A.T.photo 0.27556644 FALSE TRUE FALSE
## S.T.photo 0.29087569 FALSE TRUE FALSE
## A.npnct15.log 0.04592774 FALSE FALSE FALSE
## A.T.intern 0.32149418 FALSE TRUE FALSE
## S.T.intern 0.30618494 FALSE TRUE FALSE
## A.T.tribun 0.24494795 FALSE TRUE FALSE
## S.T.tribun 0.24494795 FALSE TRUE FALSE
## A.P.fashion.week 0.03061849 FALSE TRUE FALSE
## S.P.fashion.week 0.03061849 FALSE TRUE FALSE
## A.T.archiv 0.24494795 FALSE TRUE FALSE
## S.T.archiv 0.24494795 FALSE TRUE FALSE
## H.P.fashion.week 0.03061849 FALSE TRUE FALSE
## H.P.year.colon 0.03061849 FALSE TRUE FALSE
## H.T.fashion 0.19902021 FALSE TRUE FALSE
## H.npnct15.log 0.04592774 FALSE FALSE FALSE
## A.T.fashion 0.39804042 FALSE TRUE FALSE
## S.T.fashion 0.38273117 FALSE TRUE FALSE
## A.T.week 0.47458665 FALSE TRUE FALSE
## S.T.week 0.41334966 FALSE TRUE FALSE
## H.nstopwrds.log 0.12247397 FALSE FALSE FALSE
## H.npnct28.log 0.03061849 FALSE TRUE FALSE
## S.npnct11.log 0.13778322 FALSE FALSE FALSE
## A.npnct11.log 0.13778322 FALSE FALSE FALSE
## S.nstopwrds.log 0.38273117 FALSE FALSE FALSE
## A.nstopwrds.log 0.42865891 FALSE FALSE FALSE
## H.ndgts.log 0.18371096 FALSE FALSE FALSE
## S.ndgts.log 0.26025720 FALSE FALSE FALSE
## A.ndgts.log 0.29087569 FALSE FALSE FALSE
## H.nuppr.log 0.29087569 FALSE FALSE FALSE
## H.nwrds.log 0.32149418 FALSE FALSE FALSE
## H.nchrs.log 1.57685242 FALSE FALSE FALSE
## S.nwrds.log 0.73484385 FALSE FALSE FALSE
## A.nwrds.log 0.93386405 FALSE FALSE FALSE
## H.nwrds.unq.log 0.21432945 FALSE FALSE FALSE
## A.nchrs.log 4.39375383 FALSE FALSE FALSE
## S.nchrs.log 3.72014697 FALSE FALSE FALSE
## A.nwrds.unq.log 0.55113288 FALSE FALSE FALSE
## S.nwrds.unq.log 0.44396816 FALSE FALSE FALSE
## S.nuppr.log 0.33680343 FALSE FALSE FALSE
## A.nuppr.log 0.33680343 FALSE FALSE FALSE
## A.npnct05.log 0.01530925 TRUE TRUE TRUE
## A.npnct09.log 0.01530925 TRUE TRUE TRUE
## A.npnct22.log 0.01530925 TRUE TRUE TRUE
## A.npnct26.log 0.01530925 TRUE TRUE TRUE
## A.npnct27.log 0.01530925 TRUE TRUE TRUE
## A.npnct29.log 0.01530925 TRUE TRUE TRUE
## A.npnct30.log 0.01530925 TRUE TRUE TRUE
## H.npnct09.log 0.01530925 TRUE TRUE TRUE
## H.npnct17.log 0.01530925 TRUE TRUE TRUE
## H.npnct18.log 0.01530925 TRUE TRUE TRUE
## H.npnct21.log 0.01530925 TRUE TRUE TRUE
## H.npnct22.log 0.01530925 TRUE TRUE TRUE
## H.npnct23.log 0.01530925 TRUE TRUE TRUE
## H.npnct25.log 0.01530925 TRUE TRUE TRUE
## H.npnct26.log 0.01530925 TRUE TRUE TRUE
## H.npnct27.log 0.01530925 TRUE TRUE TRUE
## H.npnct29.log 0.01530925 TRUE TRUE TRUE
## H.npnct30.log 0.01530925 TRUE TRUE TRUE
## H.P.http 0.01530925 TRUE TRUE TRUE
## PubDate.year.fctr 0.01530925 TRUE TRUE TRUE
## S.npnct05.log 0.01530925 TRUE TRUE TRUE
## S.npnct09.log 0.01530925 TRUE TRUE TRUE
## S.npnct17.log 0.01530925 TRUE TRUE TRUE
## S.npnct18.log 0.01530925 TRUE TRUE TRUE
## S.npnct22.log 0.01530925 TRUE TRUE TRUE
## S.npnct25.log 0.01530925 TRUE TRUE TRUE
## S.npnct26.log 0.01530925 TRUE TRUE TRUE
## S.npnct27.log 0.01530925 TRUE TRUE TRUE
## S.npnct29.log 0.01530925 TRUE TRUE TRUE
## S.npnct30.log 0.01530925 TRUE TRUE TRUE
## S.P.http 0.01530925 TRUE TRUE TRUE
## is.cor.y.abs.low
## Popular FALSE
## WordCount.log FALSE
## A.ratio.sum.TfIdf.nwrds FALSE
## S.ratio.sum.TfIdf.nwrds FALSE
## WordCount FALSE
## H.ratio.sum.TfIdf.nwrds FALSE
## .clusterid FALSE
## .clusterid.fctr FALSE
## H.sum.TfIdf FALSE
## S.sum.TfIdf FALSE
## A.sum.TfIdf FALSE
## PubDate.hour.fctr FALSE
## H.npnct19.log FALSE
## A.ratio.nstopwrds.nwrds FALSE
## S.ratio.nstopwrds.nwrds FALSE
## PubDate.wkend FALSE
## H.P.recap.colon FALSE
## H.P.quandary FALSE
## H.P.no.comment.colon FALSE
## S.npnct19.log FALSE
## A.npnct19.log FALSE
## H.P.facts.figures FALSE
## PubDate.last10 FALSE
## H.npnct08.log FALSE
## PubDate.last10.log FALSE
## PubDate.last1.log FALSE
## H.P.readers.respond FALSE
## A.T.make FALSE
## S.T.make FALSE
## H.ratio.nstopwrds.nwrds FALSE
## PubDate.last100 FALSE
## PubDate.last1 FALSE
## H.T.get FALSE
## H.npnct06.log FALSE
## A.npnct01.log FALSE
## S.npnct01.log FALSE
## A.T.can FALSE
## H.npnct16.log FALSE
## S.T.can FALSE
## S.npnct21.log FALSE
## S.npnct23.log FALSE
## H.T.ebola FALSE
## H.npnct01.log FALSE
## PubDate.month.fctr FALSE
## A.T.said FALSE
## S.T.said FALSE
## PubDate.POSIX FALSE
## PubDate.zoo FALSE
## A.npnct21.log FALSE
## A.npnct23.log FALSE
## H.T.make FALSE
## H.npnct11.log FALSE
## myCategory.fctr FALSE
## UniqueID FALSE
## A.T.one FALSE
## S.T.one FALSE
## H.npnct03.log FALSE
## H.P.s.notebook TRUE
## A.npnct24.log TRUE
## H.npnct24.log TRUE
## S.npnct24.log TRUE
## H.T.take TRUE
## A.npnct16.log TRUE
## S.npnct16.log TRUE
## A.T.presid TRUE
## S.T.presid TRUE
## S.npnct08.log TRUE
## A.npnct08.log TRUE
## A.npnct25.log TRUE
## A.npnct10.log TRUE
## H.npnct10.log TRUE
## H.npnct20.log TRUE
## S.npnct02.log TRUE
## S.npnct10.log TRUE
## PubDate.last100.log TRUE
## .rnorm FALSE
## H.npnct05.log FALSE
## H.P.friday.night.music FALSE
## H.T.say FALSE
## H.T.obama FALSE
## H.T.bank FALSE
## PubDate.date.fctr FALSE
## PubDate.second.fctr FALSE
## H.npnct07.log FALSE
## A.npnct07.log FALSE
## S.npnct07.log FALSE
## S.npnct03.log FALSE
## A.npnct18.log FALSE
## A.P.http FALSE
## H.npnct12.log FALSE
## A.npnct03.log FALSE
## H.T.word FALSE
## H.T.big FALSE
## A.npnct02.log FALSE
## A.npnct17.log FALSE
## A.P.year.colon FALSE
## S.P.year.colon FALSE
## S.T.obama FALSE
## A.T.obama FALSE
## A.npnct20.log FALSE
## S.npnct20.log FALSE
## H.npnct02.log FALSE
## H.T.test FALSE
## S.npnct14.log FALSE
## A.P.first.draft FALSE
## H.P.on.this.day FALSE
## S.P.first.draft FALSE
## S.T.take FALSE
## A.T.take FALSE
## A.npnct06.log FALSE
## S.npnct06.log FALSE
## A.npnct14.log FALSE
## S.T.time FALSE
## A.T.time FALSE
## H.T.newyorktim FALSE
## H.npnct13.log FALSE
## H.T.deal FALSE
## S.T.new FALSE
## A.T.new FALSE
## H.T.billion FALSE
## A.P.metropolitan.diary.colon FALSE
## S.P.metropolitan.diary.colon FALSE
## H.T.polit FALSE
## H.P.verbatim.colon FALSE
## H.T.china FALSE
## H.T.art FALSE
## PubDate.minute.fctr FALSE
## H.T.read FALSE
## S.npnct12.log FALSE
## H.P.today.in.politic FALSE
## A.T.year FALSE
## S.T.year FALSE
## A.npnct12.log FALSE
## H.P.what.we.are FALSE
## A.T.will FALSE
## S.T.will FALSE
## A.T.appear FALSE
## S.T.appear FALSE
## PubDate.wkday.fctr FALSE
## H.T.pictur FALSE
## H.T.new FALSE
## A.T.senat FALSE
## S.T.senat FALSE
## S.T.show FALSE
## A.T.show FALSE
## H.P.today.in.smallbusiness FALSE
## S.T.day FALSE
## A.T.day FALSE
## H.P.first.draft FALSE
## S.npnct28.log FALSE
## A.npnct28.log FALSE
## A.P.daily.clip.report FALSE
## H.P.daily.clip.report FALSE
## H.T.clip FALSE
## S.P.daily.clip.report FALSE
## A.T.first FALSE
## H.T.news FALSE
## S.T.first FALSE
## H.T.first FALSE
## H.T.X2014 FALSE
## A.T.newyork FALSE
## S.T.newyork FALSE
## A.T.report FALSE
## A.T.compani FALSE
## S.T.report FALSE
## S.T.compani FALSE
## A.T.word FALSE
## S.T.word FALSE
## H.T.morn FALSE
## H.T.busi FALSE
## A.T.newyorktim FALSE
## S.T.newyorktim FALSE
## A.npnct13.log FALSE
## A.T.share FALSE
## S.T.share FALSE
## H.npnct04.log FALSE
## S.npnct13.log FALSE
## A.T.articl FALSE
## S.T.articl FALSE
## H.T.newyork FALSE
## H.T.today FALSE
## H.T.springsumm FALSE
## H.T.day FALSE
## H.npnct14.log FALSE
## A.T.diari FALSE
## S.T.diari FALSE
## H.T.report FALSE
## A.npnct04.log FALSE
## S.npnct04.log FALSE
## H.T.daili FALSE
## H.T.X2015 FALSE
## A.T.herald FALSE
## S.T.herald FALSE
## S.npnct15.log FALSE
## H.T.week FALSE
## A.T.photo FALSE
## S.T.photo FALSE
## A.npnct15.log FALSE
## A.T.intern FALSE
## S.T.intern FALSE
## A.T.tribun FALSE
## S.T.tribun FALSE
## A.P.fashion.week FALSE
## S.P.fashion.week FALSE
## A.T.archiv FALSE
## S.T.archiv FALSE
## H.P.fashion.week FALSE
## H.P.year.colon FALSE
## H.T.fashion FALSE
## H.npnct15.log FALSE
## A.T.fashion FALSE
## S.T.fashion FALSE
## A.T.week FALSE
## S.T.week FALSE
## H.nstopwrds.log FALSE
## H.npnct28.log FALSE
## S.npnct11.log FALSE
## A.npnct11.log FALSE
## S.nstopwrds.log FALSE
## A.nstopwrds.log FALSE
## H.ndgts.log FALSE
## S.ndgts.log FALSE
## A.ndgts.log FALSE
## H.nuppr.log FALSE
## H.nwrds.log FALSE
## H.nchrs.log FALSE
## S.nwrds.log FALSE
## A.nwrds.log FALSE
## H.nwrds.unq.log FALSE
## A.nchrs.log FALSE
## S.nchrs.log FALSE
## A.nwrds.unq.log FALSE
## S.nwrds.unq.log FALSE
## S.nuppr.log FALSE
## A.nuppr.log FALSE
## A.npnct05.log NA
## A.npnct09.log NA
## A.npnct22.log NA
## A.npnct26.log NA
## A.npnct27.log NA
## A.npnct29.log NA
## A.npnct30.log NA
## H.npnct09.log NA
## H.npnct17.log NA
## H.npnct18.log NA
## H.npnct21.log NA
## H.npnct22.log NA
## H.npnct23.log NA
## H.npnct25.log NA
## H.npnct26.log NA
## H.npnct27.log NA
## H.npnct29.log NA
## H.npnct30.log NA
## H.P.http NA
## PubDate.year.fctr NA
## S.npnct05.log NA
## S.npnct09.log NA
## S.npnct17.log NA
## S.npnct18.log NA
## S.npnct22.log NA
## S.npnct25.log NA
## S.npnct26.log NA
## S.npnct27.log NA
## S.npnct29.log NA
## S.npnct30.log NA
## S.P.http NA
# Scatter of percentUnique against freqRatio for every feature, colored by the
# myNearZV flag, with point shape taken from the nzv column. The x axis is
# limited to [-5, 25].
print(
    myplot_scatter(glb_feats_df, "percentUnique", "freqRatio",
                   colorcol_name = "myNearZV", jitter = TRUE) +
        geom_point(aes(shape = nzv)) +
        xlim(-5, 25)
)
## Warning in myplot_scatter(glb_feats_df, "percentUnique", "freqRatio",
## colorcol_name = "myNearZV", : converting myNearZV to class:factor
## Warning in loop_apply(n, do.ply): Removed 16 rows containing missing values
## (geom_point).
## Warning in loop_apply(n, do.ply): Removed 16 rows containing missing values
## (geom_point).
## Warning in loop_apply(n, do.ply): Removed 16 rows containing missing values
## (geom_point).
# List the features flagged myNearZV (rows whose flag is NA are left out).
print(glb_feats_df[which(glb_feats_df$myNearZV), ])
## id cor.y exclude.as.feat
## S.npnct21.log S.npnct21.log 2.760321e-02 0
## S.npnct23.log S.npnct23.log 2.760321e-02 0
## A.npnct21.log A.npnct21.log 1.537569e-02 0
## A.npnct23.log A.npnct23.log 1.537569e-02 0
## H.npnct03.log H.npnct03.log 9.533020e-03 0
## A.npnct24.log A.npnct24.log -9.890046e-19 0
## H.npnct24.log H.npnct24.log -9.890046e-19 0
## S.npnct24.log S.npnct24.log -9.890046e-19 0
## A.npnct25.log A.npnct25.log -5.547032e-03 0
## A.npnct10.log A.npnct10.log -5.547032e-03 0
## H.npnct10.log H.npnct10.log -5.547032e-03 0
## H.npnct20.log H.npnct20.log -5.547032e-03 0
## S.npnct02.log S.npnct02.log -5.547032e-03 0
## S.npnct10.log S.npnct10.log -5.547032e-03 0
## A.npnct05.log A.npnct05.log NA 0
## A.npnct09.log A.npnct09.log NA 0
## A.npnct22.log A.npnct22.log NA 0
## A.npnct26.log A.npnct26.log NA 0
## A.npnct27.log A.npnct27.log NA 0
## A.npnct29.log A.npnct29.log NA 0
## A.npnct30.log A.npnct30.log NA 0
## H.npnct09.log H.npnct09.log NA 0
## H.npnct17.log H.npnct17.log NA 0
## H.npnct18.log H.npnct18.log NA 0
## H.npnct21.log H.npnct21.log NA 0
## H.npnct22.log H.npnct22.log NA 0
## H.npnct23.log H.npnct23.log NA 0
## H.npnct25.log H.npnct25.log NA 0
## H.npnct26.log H.npnct26.log NA 0
## H.npnct27.log H.npnct27.log NA 0
## H.npnct29.log H.npnct29.log NA 0
## H.npnct30.log H.npnct30.log NA 0
## H.P.http H.P.http NA 0
## PubDate.year.fctr PubDate.year.fctr NA 0
## S.npnct05.log S.npnct05.log NA 0
## S.npnct09.log S.npnct09.log NA 0
## S.npnct17.log S.npnct17.log NA 0
## S.npnct18.log S.npnct18.log NA 0
## S.npnct22.log S.npnct22.log NA 0
## S.npnct25.log S.npnct25.log NA 0
## S.npnct26.log S.npnct26.log NA 0
## S.npnct27.log S.npnct27.log NA 0
## S.npnct29.log S.npnct29.log NA 0
## S.npnct30.log S.npnct30.log NA 0
## S.P.http S.P.http NA 0
## cor.y.abs cor.high.X freqRatio percentUnique
## S.npnct21.log 2.760321e-02 A.npnct21.log 6531.000 0.03061849
## S.npnct23.log 2.760321e-02 <NA> 6531.000 0.03061849
## A.npnct21.log 1.537569e-02 A.npnct23.log 3264.500 0.04592774
## A.npnct23.log 1.537569e-02 <NA> 3264.500 0.04592774
## H.npnct03.log 9.533020e-03 <NA> 2176.333 0.03061849
## A.npnct24.log 9.890046e-19 <NA> 0.000 0.01530925
## H.npnct24.log 9.890046e-19 <NA> 0.000 0.01530925
## S.npnct24.log 9.890046e-19 <NA> 0.000 0.01530925
## A.npnct25.log 5.547032e-03 <NA> 6531.000 0.03061849
## A.npnct10.log 5.547032e-03 <NA> 6531.000 0.03061849
## H.npnct10.log 5.547032e-03 <NA> 6531.000 0.03061849
## H.npnct20.log 5.547032e-03 <NA> 6531.000 0.03061849
## S.npnct02.log 5.547032e-03 <NA> 6531.000 0.03061849
## S.npnct10.log 5.547032e-03 <NA> 6531.000 0.03061849
## A.npnct05.log NA <NA> 0.000 0.01530925
## A.npnct09.log NA <NA> 0.000 0.01530925
## A.npnct22.log NA <NA> 0.000 0.01530925
## A.npnct26.log NA <NA> 0.000 0.01530925
## A.npnct27.log NA <NA> 0.000 0.01530925
## A.npnct29.log NA <NA> 0.000 0.01530925
## A.npnct30.log NA <NA> 0.000 0.01530925
## H.npnct09.log NA <NA> 0.000 0.01530925
## H.npnct17.log NA <NA> 0.000 0.01530925
## H.npnct18.log NA <NA> 0.000 0.01530925
## H.npnct21.log NA <NA> 0.000 0.01530925
## H.npnct22.log NA <NA> 0.000 0.01530925
## H.npnct23.log NA <NA> 0.000 0.01530925
## H.npnct25.log NA <NA> 0.000 0.01530925
## H.npnct26.log NA <NA> 0.000 0.01530925
## H.npnct27.log NA <NA> 0.000 0.01530925
## H.npnct29.log NA <NA> 0.000 0.01530925
## H.npnct30.log NA <NA> 0.000 0.01530925
## H.P.http NA <NA> 0.000 0.01530925
## PubDate.year.fctr NA <NA> 0.000 0.01530925
## S.npnct05.log NA <NA> 0.000 0.01530925
## S.npnct09.log NA <NA> 0.000 0.01530925
## S.npnct17.log NA <NA> 0.000 0.01530925
## S.npnct18.log NA <NA> 0.000 0.01530925
## S.npnct22.log NA <NA> 0.000 0.01530925
## S.npnct25.log NA <NA> 0.000 0.01530925
## S.npnct26.log NA <NA> 0.000 0.01530925
## S.npnct27.log NA <NA> 0.000 0.01530925
## S.npnct29.log NA <NA> 0.000 0.01530925
## S.npnct30.log NA <NA> 0.000 0.01530925
## S.P.http NA <NA> 0.000 0.01530925
## zeroVar nzv myNearZV is.cor.y.abs.low
## S.npnct21.log FALSE TRUE TRUE FALSE
## S.npnct23.log FALSE TRUE TRUE FALSE
## A.npnct21.log FALSE TRUE TRUE FALSE
## A.npnct23.log FALSE TRUE TRUE FALSE
## H.npnct03.log FALSE TRUE TRUE FALSE
## A.npnct24.log TRUE TRUE TRUE TRUE
## H.npnct24.log TRUE TRUE TRUE TRUE
## S.npnct24.log TRUE TRUE TRUE TRUE
## A.npnct25.log FALSE TRUE TRUE TRUE
## A.npnct10.log FALSE TRUE TRUE TRUE
## H.npnct10.log FALSE TRUE TRUE TRUE
## H.npnct20.log FALSE TRUE TRUE TRUE
## S.npnct02.log FALSE TRUE TRUE TRUE
## S.npnct10.log FALSE TRUE TRUE TRUE
## A.npnct05.log TRUE TRUE TRUE NA
## A.npnct09.log TRUE TRUE TRUE NA
## A.npnct22.log TRUE TRUE TRUE NA
## A.npnct26.log TRUE TRUE TRUE NA
## A.npnct27.log TRUE TRUE TRUE NA
## A.npnct29.log TRUE TRUE TRUE NA
## A.npnct30.log TRUE TRUE TRUE NA
## H.npnct09.log TRUE TRUE TRUE NA
## H.npnct17.log TRUE TRUE TRUE NA
## H.npnct18.log TRUE TRUE TRUE NA
## H.npnct21.log TRUE TRUE TRUE NA
## H.npnct22.log TRUE TRUE TRUE NA
## H.npnct23.log TRUE TRUE TRUE NA
## H.npnct25.log TRUE TRUE TRUE NA
## H.npnct26.log TRUE TRUE TRUE NA
## H.npnct27.log TRUE TRUE TRUE NA
## H.npnct29.log TRUE TRUE TRUE NA
## H.npnct30.log TRUE TRUE TRUE NA
## H.P.http TRUE TRUE TRUE NA
## PubDate.year.fctr TRUE TRUE TRUE NA
## S.npnct05.log TRUE TRUE TRUE NA
## S.npnct09.log TRUE TRUE TRUE NA
## S.npnct17.log TRUE TRUE TRUE NA
## S.npnct18.log TRUE TRUE TRUE NA
## S.npnct22.log TRUE TRUE TRUE NA
## S.npnct25.log TRUE TRUE TRUE NA
## S.npnct26.log TRUE TRUE TRUE NA
## S.npnct27.log TRUE TRUE TRUE NA
## S.npnct29.log TRUE TRUE TRUE NA
## S.npnct30.log TRUE TRUE TRUE NA
## S.P.http TRUE TRUE TRUE NA
# Keep only the columns of glb_allobs_df that are not flagged myNearZV in
# glb_feats_df, then register the start of the partition.data.training step.
glb_allobs_df <- glb_allobs_df[, !(names(glb_allobs_df) %in%
                                   subset(glb_feats_df, myNearZV)$id)]
glb_chunks_df <- myadd_chunk(glb_chunks_df, "partition.data.training",
                             major.inc = TRUE)
## label step_major step_minor bgn end elapsed
## 8 select.features 5 0 235.859 370.726 134.867
## 9 partition.data.training 6 0 370.726 NA NA
6.0: partition data training
# Partition the training observations into a fit set and an out-of-bag (OOB)
# validation set.
#  - When the new observations carry no response values, split glb_trnobs_df
#    with caTools::sample.split on the raw response, sized so that the OOB set
#    holds about 1.1x as many rows as glb_newobs_df.
#  - Otherwise the labelled new observations are used as the OOB set and all
#    training observations are used for fitting (no `split` mask is created).
if (all(is.na(glb_newobs_df[, glb_rsp_var]))) {
    require(caTools)
    set.seed(glb_split_sample.seed)
    fit_split_ratio <- 1 - (nrow(glb_newobs_df) * 1.1 / nrow(glb_trnobs_df))
    # sample.split treats SplitRatio >= 1 as a row count rather than a
    # fraction, and a non-positive value leaves no usable fit set; fail
    # loudly instead of silently producing a wrong partition.
    if (!(fit_split_ratio > 0 && fit_split_ratio < 1))
        stop("glb_newobs_df is too large relative to glb_trnobs_df to hold out ",
             "1.1 x nrow(glb_newobs_df) rows for OOB; SplitRatio = ",
             format(fit_split_ratio))
    split <- sample.split(glb_trnobs_df[, glb_rsp_var_raw],
                          SplitRatio=fit_split_ratio)
    glb_fitobs_df <- glb_trnobs_df[split, ]
    glb_OOBobs_df <- glb_trnobs_df[!split ,]
} else {
    print(sprintf("Newdata contains non-NA data for %s; setting OOB to Newdata",
                  glb_rsp_var))
    glb_fitobs_df <- glb_trnobs_df; glb_OOBobs_df <- glb_newobs_df
}
## Loading required package: caTools
# Optional cap on the fit set: when glb_max_fitent_obs is set and the fit set
# has more rows than that, warn and down-sample it. The cap is handed to
# sample.split as SplitRatio and the resulting mask is kept in `split`.
if (!is.null(glb_max_fitent_obs) && (nrow(glb_fitobs_df) > glb_max_fitent_obs)) {
    warning("glb_fitobs_df restricted to glb_max_fitent_obs: ",
            format(glb_max_fitent_obs, big.mark=","))
    split <- sample.split(glb_fitobs_df[, glb_rsp_var_raw],
                          SplitRatio=glb_max_fitent_obs)
    glb_fitobs_df <- glb_fitobs_df[split, ]
    # The unrestricted copy is not retained; the name is left bound to NULL.
    org_fitent_df <- NULL
}
# Label every observation with the partition it belongs to: "Fit", "OOB", or
# "" when its id appears in neither glb_fitobs_df nor glb_OOBobs_df.
glb_allobs_df$.lcn <- ""
glb_allobs_df$.lcn[glb_allobs_df[, glb_id_vars] %in%
                   glb_fitobs_df[, glb_id_vars]] <- "Fit"
glb_allobs_df$.lcn[glb_allobs_df[, glb_id_vars] %in%
                   glb_OOBobs_df[, glb_id_vars]] <- "OOB"
# Print how partition_var is distributed within each value of location_var.
#
# Args:
#   obs_df:        data frame of observations.
#   location_var:  name of the column whose values become the row labels.
#   partition_var: name of the column whose values are tallied.
#
# Prints the cross-tab of counts, then the same table as row-wise proportions.
# Returns the proportions table (invisibly, as the value of the last print).
#
# Relies on mycreate_xtab_df (defined elsewhere) returning a data frame that
# contains a column named location_var plus the count columns.
dsp_class_dstrb <- function(obs_df, location_var, partition_var) {
    xtab_df <- mycreate_xtab_df(obs_df, c(location_var, partition_var))
    rownames(xtab_df) <- xtab_df[, location_var]
    # Drop the label column by exact name. Negating grepl()'s logical result
    # (-TRUE/-FALSE => -1/0 indices) always removed column 1, whichever column
    # actually matched; drop=FALSE keeps a data frame even when only one count
    # column remains, so rowSums() below still works.
    xtab_df <- xtab_df[, names(xtab_df) != location_var, drop=FALSE]
    print(xtab_df)
    print(xtab_df / rowSums(xtab_df, na.rm=TRUE))
}
# Verify the split: show the glb_rsp_var_raw class counts and row-wise
# proportions for each .lcn value ("Fit", "OOB", or "" for observations that are
# in neither partition). The OOB vs. new comparison on a chosen feature follows.
dsp_class_dstrb(glb_allobs_df, ".lcn", glb_rsp_var_raw)
## Popular.0 Popular.1 Popular.NA
## NA NA 1870
## Fit 3726 749 NA
## OOB 1713 344 NA
## Popular.0 Popular.1 Popular.NA
## NA NA 1
## Fit 0.8326257 0.1673743 NA
## OOB 0.8327662 0.1672338 NA
# Compare the myCategory mix of the Test-sourced rows with that of the OOB
# partition: tabulate each, outer-join the two tables on myCategory, turn the
# counts into shares of their column total, and print sorted by descending
# Test share, then descending OOB share.
newent_ctgry_df <- mycreate_sqlxtab_df(subset(glb_allobs_df, .src == "Test"),
                                       "myCategory")
OOBent_ctgry_df <- mycreate_sqlxtab_df(subset(glb_allobs_df, .lcn == "OOB"),
                                       "myCategory")
glb_ctgry_df <- merge(newent_ctgry_df, OOBent_ctgry_df,
                      by = "myCategory", all = TRUE,
                      suffixes = c(".Tst", ".OOB"))
glb_ctgry_df[[".freqRatio.Tst"]] <-
    glb_ctgry_df[[".n.Tst"]] / sum(glb_ctgry_df[[".n.Tst"]], na.rm = TRUE)
glb_ctgry_df[[".freqRatio.OOB"]] <-
    glb_ctgry_df[[".n.OOB"]] / sum(glb_ctgry_df[[".n.OOB"]], na.rm = TRUE)
print(orderBy(~ -.freqRatio.Tst - .freqRatio.OOB, glb_ctgry_df))
## myCategory .n.Tst .n.OOB .freqRatio.Tst
## 1 ## 338 407 0.180748663
## 6 Business#Business Day#Dealbook 304 312 0.162566845
## 10 Culture#Arts# 244 225 0.130481283
## 15 OpEd#Opinion# 164 154 0.087700535
## 9 Business#Technology# 113 114 0.060427807
## 20 TStyle## 105 221 0.056149733
## 5 #U.S.#Education 90 93 0.048128342
## 13 Metro#N.Y. / Region# 67 60 0.035828877
## 18 Styles#U.S.# 62 54 0.033155080
## 16 Science#Health# 57 66 0.030481283
## 12 Foreign#World#Asia Pacific 56 61 0.029946524
## 2 #Multimedia# 52 42 0.027807487
## 11 Foreign#World# 47 47 0.025133690
## 7 Business#Business Day#Small Business 42 45 0.022459893
## 8 Business#Crosswords/Games# 42 40 0.022459893
## 19 Travel#Travel# 35 31 0.018716578
## 3 #Opinion#Room For Debate 24 21 0.012834225
## 17 Styles##Fashion 15 41 0.008021390
## 4 #Opinion#The Public Editor 10 10 0.005347594
## 14 myOther 3 13 0.001604278
## .freqRatio.OOB
## 1 0.197860963
## 6 0.151677200
## 10 0.109382596
## 15 0.074866310
## 9 0.055420515
## 20 0.107438017
## 5 0.045211473
## 13 0.029168692
## 18 0.026251823
## 16 0.032085561
## 12 0.029654837
## 2 0.020418085
## 11 0.022848809
## 7 0.021876519
## 8 0.019445795
## 19 0.015070491
## 3 0.010209042
## 17 0.019931940
## 4 0.004861449
## 14 0.006319883
# Run the following statements one line at a time.
print("glb_feats_df:")
print(dim(glb_feats_df))
## [1] "glb_feats_df:"
## [1] 266 11
# Save/restore pair: presumably lets the annotation steps below be re-run
# interactively from the saved copy of the feature table.
sav_feats_df <- glb_feats_df
glb_feats_df <- sav_feats_df
# Flag the row that describes the raw response column.
glb_feats_df[, "rsp_var_raw"] <- FALSE
glb_feats_df[glb_feats_df$id == glb_rsp_var_raw, "rsp_var_raw"] <- TRUE
# Turn the 0/1 exclusion indicator into a logical.
glb_feats_df$exclude.as.feat <- (glb_feats_df$exclude.as.feat == 1)
# Flag the id column(s); rows that are not ids keep NA in id_var.
# `%in%` is used for the ".rownames" test because `!=` inside `&&` is an error
# in R >= 4.3 once glb_id_vars holds more than one name, whereas the
# assignment below already supports several ids.
if (!is.null(glb_id_vars) && !(".rownames" %in% glb_id_vars))
    glb_feats_df[glb_feats_df$id %in% glb_id_vars, "id_var"] <- TRUE
# One-row feature entry for the derived response variable: excluded as a
# feature and flagged rsp_var.
add_feats_df <- data.frame(id=glb_rsp_var, exclude.as.feat=TRUE, rsp_var=TRUE)
row.names(add_feats_df) <- add_feats_df$id
print(add_feats_df)
## id exclude.as.feat rsp_var
## Popular.fctr Popular.fctr TRUE TRUE
# Append the derived-response row to the feature table. myrbind_df is defined
# elsewhere; judging by the printed result, columns present in only one of the
# inputs are filled with NA.
glb_feats_df <- myrbind_df(glb_feats_df, add_feats_df)
# Show the bookkeeping rows (raw response, derived response, id variables).
# A row is kept when at least one of the three flags is TRUE; NA flags do not
# prevent that.
print(subset(glb_feats_df, rsp_var_raw | rsp_var | id_var))
## id cor.y exclude.as.feat cor.y.abs cor.high.X
## Popular Popular 1.00000000 TRUE 1.00000000 <NA>
## UniqueID UniqueID 0.01182492 TRUE 0.01182492 <NA>
## Popular.fctr Popular.fctr NA TRUE NA <NA>
## freqRatio percentUnique zeroVar nzv myNearZV
## Popular 4.976212 0.03061849 FALSE FALSE FALSE
## UniqueID 1.000000 100.00000000 FALSE FALSE FALSE
## Popular.fctr NA NA NA NA NA
## is.cor.y.abs.low rsp_var_raw id_var rsp_var
## Popular FALSE TRUE NA NA
## UniqueID FALSE FALSE TRUE NA
## Popular.fctr NA NA NA TRUE
# Feature ids listed in glb_feats_df that have no column in glb_allobs_df.
# In the recorded run these are the myNearZV features whose columns were
# dropped from glb_allobs_df earlier in this section.
print("glb_feats_df vs. glb_allobs_df: ");
## [1] "glb_feats_df vs. glb_allobs_df: "
print(setdiff(glb_feats_df$id, names(glb_allobs_df)))
## [1] "S.npnct21.log" "S.npnct23.log" "A.npnct21.log"
## [4] "A.npnct23.log" "H.npnct03.log" "A.npnct24.log"
## [7] "H.npnct24.log" "S.npnct24.log" "A.npnct25.log"
## [10] "A.npnct10.log" "H.npnct10.log" "H.npnct20.log"
## [13] "S.npnct02.log" "S.npnct10.log" "A.npnct05.log"
## [16] "A.npnct09.log" "A.npnct22.log" "A.npnct26.log"
## [19] "A.npnct27.log" "A.npnct29.log" "A.npnct30.log"
## [22] "H.npnct09.log" "H.npnct17.log" "H.npnct18.log"
## [25] "H.npnct21.log" "H.npnct22.log" "H.npnct23.log"
## [28] "H.npnct25.log" "H.npnct26.log" "H.npnct27.log"
## [31] "H.npnct29.log" "H.npnct30.log" "H.P.http"
## [34] "PubDate.year.fctr" "S.npnct05.log" "S.npnct09.log"
## [37] "S.npnct17.log" "S.npnct18.log" "S.npnct22.log"
## [40] "S.npnct25.log" "S.npnct26.log" "S.npnct27.log"
## [43] "S.npnct29.log" "S.npnct30.log" "S.P.http"
print("glb_allobs_df vs. glb_feats_df: ");
## [1] "glb_allobs_df vs. glb_feats_df: "
# Columns of glb_allobs_df that have no entry in glb_feats_df, after removing
# those reported by myfind_chr_cols_df. The result is expected to be empty,
# i.e. only chr columns may lack a feature entry.
print(setdiff(setdiff(names(glb_allobs_df), glb_feats_df$id),
myfind_chr_cols_df(glb_allobs_df)))
## character(0)
#print(setdiff(setdiff(names(glb_allobs_df), glb_exclude_vars_as_features),
# glb_feats_df$id))
# Dimensions of the combined observations data frame.
print("glb_allobs_df: "); print(dim(glb_allobs_df))
## [1] "glb_allobs_df: "
## [1] 8402 232
# Dimensions of the training, fit, OOB and new-observation data frames.
# NOTE(review): the recorded output shows 276 columns for these partitions but
# 232 for glb_allobs_df. The myNearZV columns were removed from glb_allobs_df
# only (and .lcn was added to it afterwards); confirm the partitions are
# re-derived, or the features filtered, before modelling.
print("glb_trnobs_df: "); print(dim(glb_trnobs_df))
## [1] "glb_trnobs_df: "
## [1] 6532 276
print("glb_fitobs_df: "); print(dim(glb_fitobs_df))
## [1] "glb_fitobs_df: "
## [1] 4475 276
print("glb_OOBobs_df: "); print(dim(glb_OOBobs_df))
## [1] "glb_OOBobs_df: "
## [1] 2057 276
print("glb_newobs_df: "); print(dim(glb_newobs_df))
## [1] "glb_newobs_df: "
## [1] 1870 276
# # Does not handle NULL or length(glb_id_vars) > 1
# glb_allobs_df$.src.trn <- 0
# glb_allobs_df[glb_allobs_df[, glb_id_vars] %in% glb_trnobs_df[, glb_id_vars],
# ".src.trn"] <- 1
# glb_allobs_df$.src.fit <- 0
# glb_allobs_df[glb_allobs_df[, glb_id_vars] %in% glb_fitobs_df[, glb_id_vars],
# ".src.fit"] <- 1
# glb_allobs_df$.src.OOB <- 0
# glb_allobs_df[glb_allobs_df[, glb_id_vars] %in% glb_OOBobs_df[, glb_id_vars],
# ".src.OOB"] <- 1
# glb_allobs_df$.src.new <- 0
# glb_allobs_df[glb_allobs_df[, glb_id_vars] %in% glb_newobs_df[, glb_id_vars],
# ".src.new"] <- 1
# #print(unique(glb_allobs_df[, ".src.trn"]))
# write_cols <- c(glb_feats_df$id,
# ".src.trn", ".src.fit", ".src.OOB", ".src.new")
# glb_allobs_df <- glb_allobs_df[, write_cols]
#
# tmp_feats_df <- glb_feats_df
# tmp_entity_df <- glb_allobs_df
# When glb_save_envir is set, write the feature table and the combined
# observations to "<glb_out_pfx>blddfs_dsk.RData".
if (glb_save_envir) {
    save(glb_feats_df,
         glb_allobs_df,
         #glb_trnobs_df, glb_fitobs_df, glb_OOBobs_df, glb_newobs_df,
         file = paste0(glb_out_pfx, "blddfs_dsk.RData"))
}
# load(paste0(glb_out_pfx, "blddfs_dsk.RData"))
# if (!all.equal(tmp_feats_df, glb_feats_df))
# stop("glb_feats_df r/w not working")
# if (!all.equal(tmp_entity_df, glb_allobs_df))
# stop("glb_allobs_df r/w not working")
# Remove the temporary split mask if one exists. None is created when labelled
# new data is used as the OOB set and the fit set is not capped, in which case
# a plain rm(split) would raise an "object not found" warning.
rm(list=intersect("split", ls()))
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.models", major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 9 partition.data.training 6 0 370.726 372.048 1.322
## 10 fit.models 7 0 372.049 NA NA
7.0: fit models
# load(paste0(glb_out_pfx, "dsk.RData"))
# keep_cols <- setdiff(names(glb_allobs_df),
# grep("^.src", names(glb_allobs_df), value=TRUE))
# glb_trnobs_df <- glb_allobs_df[glb_allobs_df$.src.trn == 1, keep_cols]
# glb_fitobs_df <- glb_allobs_df[glb_allobs_df$.src.fit == 1, keep_cols]
# glb_OOBobs_df <- glb_allobs_df[glb_allobs_df$.src.OOB == 1, keep_cols]
# glb_newobs_df <- glb_allobs_df[glb_allobs_df$.src.new == 1, keep_cols]
#
# glb_models_lst <- list(); glb_models_df <- data.frame()
#
# A binomial classifier cannot be fitted unless the fit set contains at least
# two distinct response values.
if (glb_is_classification && glb_is_binomial &&
        (length(unique(glb_fitobs_df[, glb_rsp_var])) < 2))
    stop("glb_fitobs_df$", glb_rsp_var, ": contains less than 2 unique values: ",
         paste0(unique(glb_fitobs_df[, glb_rsp_var]), collapse=", "))
# Id of the candidate feature (not excluded, correlation not flagged as low)
# with the largest absolute correlation with the response.
max_cor_y_x_var <- orderBy(~ -cor.y.abs,
        subset(glb_feats_df, (exclude.as.feat == 0) & !is.cor.y.abs.low))[1, "id"]
# If a Baseline variable is configured, stop when a different feature is more
# strongly correlated with the response than it is. The message previously
# said "lower", contradicting the `>` test that triggers it; `&&` replaces the
# elementwise `&` because this is a scalar condition.
if (!is.null(glb_Baseline_mdl_var)) {
    if ((max_cor_y_x_var != glb_Baseline_mdl_var) &&
        (glb_feats_df[max_cor_y_x_var, "cor.y.abs"] >
         glb_feats_df[glb_Baseline_mdl_var, "cor.y.abs"]))
        stop(max_cor_y_x_var, " has a higher correlation with ", glb_rsp_var,
             " than the Baseline var: ", glb_Baseline_mdl_var)
}
# Scalar choice, so plain if/else rather than the vectorized ifelse().
glb_model_type <- if (glb_is_regression) "regression" else "classification"
# Baseline model: fitted only when a Baseline variable has been configured.
# NOTE(review): this call uses myfit_mdl_fn and passes no model_type, unlike
# the myfit_mdl(..., model_type=) calls used for the other models in this
# section; confirm that myfit_mdl_fn still exists.
if (!is.null(glb_Baseline_mdl_var)) {
    ret_lst <- myfit_mdl_fn(model_id = "Baseline",
                            model_method = "mybaseln_classfr",
                            indep_vars_vctr = glb_Baseline_mdl_var,
                            rsp_var = glb_rsp_var,
                            rsp_var_out = glb_rsp_var_out,
                            fit_df = glb_fitobs_df,
                            OOB_df = glb_OOBobs_df)
}
# Most Frequent Outcome ("MFO") reference model; for regression this is
# mean(y), fitted with lm.
# caret's nullModel is not used because its model stats are not available.
# rpart cannot be used for multinomial classification since it predicts
# non-MFO.
ret_lst <- myfit_mdl(
    model_id = "MFO",
    model_method = ifelse(glb_is_regression, "lm", "myMFO_classfr"),
    model_type = glb_model_type,
    indep_vars_vctr = ".rnorm",
    rsp_var = glb_rsp_var,
    rsp_var_out = glb_rsp_var_out,
    fit_df = glb_fitobs_df,
    OOB_df = glb_OOBobs_df
)
## [1] "fitting model: MFO.myMFO_classfr"
## [1] " indep_vars: .rnorm"
## Fitting parameter = none on full training set
## [1] "in MFO.Classifier$fit"
## [1] "unique.vals:"
## [1] N Y
## Levels: N Y
## [1] "unique.prob:"
## y
## N Y
## 0.8326257 0.1673743
## [1] "MFO.val:"
## [1] "N"
## Length Class Mode
## unique.vals 2 factor numeric
## unique.prob 2 -none- numeric
## MFO.val 1 -none- character
## x.names 1 -none- character
## xNames 1 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## [1] " calling mypredict_mdl for fit:"
## Loading required package: ROCR
## Loading required package: gplots
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
## [1] "in MFO.Classifier$prob"
## N Y
## 1 0.8326257 0.1673743
## 2 0.8326257 0.1673743
## 3 0.8326257 0.1673743
## 4 0.8326257 0.1673743
## 5 0.8326257 0.1673743
## 6 0.8326257 0.1673743
## [1] "Classifier Probability Threshold: 0.5000 to maximize f.score.fit"
## Popular.fctr Popular.fctr.predict.MFO.myMFO_classfr.N
## 1 N 3726
## 2 Y 749
## Prediction
## Reference N Y
## N 3726 0
## Y 749 0
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.326257e-01 0.000000e+00 8.213602e-01 8.434553e-01 8.326257e-01
## AccuracyPValue McnemarPValue
## 5.097571e-01 1.800616e-164
## [1] " calling mypredict_mdl for OOB:"
## [1] "in MFO.Classifier$prob"
## N Y
## 1 0.8326257 0.1673743
## 2 0.8326257 0.1673743
## 3 0.8326257 0.1673743
## 4 0.8326257 0.1673743
## 5 0.8326257 0.1673743
## 6 0.8326257 0.1673743
## [1] "Classifier Probability Threshold: 0.5000 to maximize f.score.OOB"
## Popular.fctr Popular.fctr.predict.MFO.myMFO_classfr.N
## 1 N 1713
## 2 Y 344
## Prediction
## Reference N Y
## N 1713 0
## Y 344 0
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.327662e-01 0.000000e+00 8.159247e-01 8.486533e-01 8.327662e-01
## AccuracyPValue McnemarPValue
## 5.143944e-01 2.337097e-76
## model_id model_method feats max.nTuningRuns
## 1 MFO.myMFO_classfr myMFO_classfr .rnorm 0
## min.elapsedtime.everything min.elapsedtime.final max.auc.fit
## 1 0.765 0.003 0.5
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.5 0 0.8326257
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.8213602 0.8434553 0 0.5
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.5 0 0.8327662
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.8159247 0.8486533 0
# "Random" reference model: classification only. Regression needs none, since
# it would be the same as MFO.
if (glb_is_classification) {
    ret_lst <- myfit_mdl(
        model_id = "Random",
        model_method = "myrandom_classfr",
        model_type = glb_model_type,
        indep_vars_vctr = ".rnorm",
        rsp_var = glb_rsp_var,
        rsp_var_out = glb_rsp_var_out,
        fit_df = glb_fitobs_df,
        OOB_df = glb_OOBobs_df
    )
}
## [1] "fitting model: Random.myrandom_classfr"
## [1] " indep_vars: .rnorm"
## Fitting parameter = none on full training set
## Length Class Mode
## unique.vals 2 factor numeric
## unique.prob 2 table numeric
## xNames 1 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## [1] " calling mypredict_mdl for fit:"
## [1] "in Random.Classifier$prob"
## threshold f.score
## 1 0.0 0.2867534
## 2 0.1 0.2867534
## 3 0.2 0.1746905
## 4 0.3 0.1746905
## 5 0.4 0.1746905
## 6 0.5 0.1746905
## 7 0.6 0.1746905
## 8 0.7 0.1746905
## 9 0.8 0.1746905
## 10 0.9 0.0000000
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.1000 to maximize f.score.fit"
## Popular.fctr Popular.fctr.predict.Random.myrandom_classfr.Y
## 1 N 3726
## 2 Y 749
## Prediction
## Reference N Y
## N 0 3726
## Y 0 749
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.1673743 0.0000000 0.1565447 0.1786398 0.8326257
## AccuracyPValue McnemarPValue
## 1.0000000 0.0000000
## [1] " calling mypredict_mdl for OOB:"
## [1] "in Random.Classifier$prob"
## threshold f.score
## 1 0.0 0.2865473
## 2 0.1 0.2865473
## 3 0.2 0.1553398
## 4 0.3 0.1553398
## 5 0.4 0.1553398
## 6 0.5 0.1553398
## 7 0.6 0.1553398
## 8 0.7 0.1553398
## 9 0.8 0.1553398
## 10 0.9 0.0000000
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.1000 to maximize f.score.OOB"
## Popular.fctr Popular.fctr.predict.Random.myrandom_classfr.Y
## 1 N 1713
## 2 Y 344
## Prediction
## Reference N Y
## N 0 1713
## Y 0 344
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.1672338 0.0000000 0.1513467 0.1840753 0.8327662
## AccuracyPValue McnemarPValue
## 1.0000000 0.0000000
## model_id model_method feats max.nTuningRuns
## 1 Random.myrandom_classfr myrandom_classfr .rnorm 0
## min.elapsedtime.everything min.elapsedtime.final max.auc.fit
## 1 0.349 0.002 0.5072166
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.1 0.2867534 0.1673743
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.1565447 0.1786398 0 0.4877001
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.2865473 0.1672338
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.1513467 0.1840753 0
# Author's notes: any model with tuning parameters gets "better" results with
# cross-validation (except rf), and "different" results for different outcome
# metrics.
#
# Max.cor.Y: check the impact of cross-validation using a single-predictor
# model on the feature most correlated with the response. rpart is not a good
# candidate, since caret does not optimize cp (rpart's only tuning parameter)
# well.
# This first variant passes neither n_cv_folds nor tune_models_df, so
# myfit_mdl's own defaults apply.
ret_lst <- myfit_mdl(
    model_id = "Max.cor.Y.cv.0",
    model_method = "rpart",
    model_type = glb_model_type,
    indep_vars_vctr = max_cor_y_x_var,
    rsp_var = glb_rsp_var,
    rsp_var_out = glb_rsp_var_out,
    fit_df = glb_fitobs_df,
    OOB_df = glb_OOBobs_df
)
## [1] "fitting model: Max.cor.Y.cv.0.rpart"
## [1] " indep_vars: A.nuppr.log"
## Loading required package: rpart
## Fitting cp = 0 on full training set
## Loading required package: rpart.plot
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 4475
##
## CP nsplit rel error
## 1 0 0 1
##
## Node number 1: 4475 observations
## predicted class=N expected loss=0.1673743 P(node) =1
## class counts: 3726 749
## probabilities: 0.833 0.167
##
## n= 4475
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 4475 749 N (0.8326257 0.1673743) *
## [1] " calling mypredict_mdl for fit:"
## [1] "Classifier Probability Threshold: 0.5000 to maximize f.score.fit"
## Popular.fctr Popular.fctr.predict.Max.cor.Y.cv.0.rpart.N
## 1 N 3726
## 2 Y 749
## Prediction
## Reference N Y
## N 3726 0
## Y 749 0
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.326257e-01 0.000000e+00 8.213602e-01 8.434553e-01 8.326257e-01
## AccuracyPValue McnemarPValue
## 5.097571e-01 1.800616e-164
## [1] " calling mypredict_mdl for OOB:"
## [1] "Classifier Probability Threshold: 0.5000 to maximize f.score.OOB"
## Popular.fctr Popular.fctr.predict.Max.cor.Y.cv.0.rpart.N
## 1 N 1713
## 2 Y 344
## Prediction
## Reference N Y
## N 1713 0
## Y 344 0
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.327662e-01 0.000000e+00 8.159247e-01 8.486533e-01 8.327662e-01
## AccuracyPValue McnemarPValue
## 5.143944e-01 2.337097e-76
## model_id model_method feats max.nTuningRuns
## 1 Max.cor.Y.cv.0.rpart rpart A.nuppr.log 0
## min.elapsedtime.everything min.elapsedtime.final max.auc.fit
## 1 0.686 0.057 0.5
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.5 0 0.8326257
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.8213602 0.8434553 0 0.5
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.5 0 0.8327662
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.8159247 0.8486533 0
# Same single-predictor rpart model, now with cross-validation explicitly
# switched off (n_cv_folds = 0) and a cp tuning grid whose min and max are
# both 0.
ret_lst <- myfit_mdl(
    model_id = "Max.cor.Y.cv.0.cp.0",
    model_method = "rpart",
    model_type = glb_model_type,
    indep_vars_vctr = max_cor_y_x_var,
    rsp_var = glb_rsp_var,
    rsp_var_out = glb_rsp_var_out,
    fit_df = glb_fitobs_df,
    OOB_df = glb_OOBobs_df,
    n_cv_folds = 0,
    tune_models_df = data.frame(parameter = "cp", min = 0.0, max = 0.0, by = 0.1)
)
## [1] "fitting model: Max.cor.Y.cv.0.cp.0.rpart"
## [1] " indep_vars: A.nuppr.log"
## Fitting cp = 0 on full training set
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 4475
##
## CP nsplit rel error
## 1 0 0 1
##
## Node number 1: 4475 observations
## predicted class=N expected loss=0.1673743 P(node) =1
## class counts: 3726 749
## probabilities: 0.833 0.167
##
## n= 4475
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 4475 749 N (0.8326257 0.1673743) *
## [1] " calling mypredict_mdl for fit:"
## [1] "Classifier Probability Threshold: 0.5000 to maximize f.score.fit"
## Popular.fctr Popular.fctr.predict.Max.cor.Y.cv.0.cp.0.rpart.N
## 1 N 3726
## 2 Y 749
## Prediction
## Reference N Y
## N 3726 0
## Y 749 0
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.326257e-01 0.000000e+00 8.213602e-01 8.434553e-01 8.326257e-01
## AccuracyPValue McnemarPValue
## 5.097571e-01 1.800616e-164
## [1] " calling mypredict_mdl for OOB:"
## [1] "Classifier Probability Threshold: 0.5000 to maximize f.score.OOB"
## Popular.fctr Popular.fctr.predict.Max.cor.Y.cv.0.cp.0.rpart.N
## 1 N 1713
## 2 Y 344
## Prediction
## Reference N Y
## N 1713 0
## Y 344 0
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.327662e-01 0.000000e+00 8.159247e-01 8.486533e-01 8.327662e-01
## AccuracyPValue McnemarPValue
## 5.143944e-01 2.337097e-76
## model_id model_method feats max.nTuningRuns
## 1 Max.cor.Y.cv.0.cp.0.rpart rpart A.nuppr.log 0
## min.elapsedtime.everything min.elapsedtime.final max.auc.fit
## 1 0.607 0.056 0.5
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.5 0 0.8326257
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.8213602 0.8434553 0 0.5
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.5 0 0.8327662
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.8159247 0.8486533 0
# Cross-validated rpart fit on the max_cor_y_x_var predictor(s), with no custom
# tuning grid. Run only for regression or binomial problems; original note:
# for multinomials this model will be run next by default.
if (glb_is_regression || glb_is_binomial) {
  ret_lst <- myfit_mdl(
    model_id = "Max.cor.Y",
    model_method = "rpart",
    model_type = glb_model_type,
    indep_vars_vctr = max_cor_y_x_var,
    rsp_var = glb_rsp_var,
    rsp_var_out = glb_rsp_var_out,
    fit_df = glb_fitobs_df,
    OOB_df = glb_OOBobs_df,
    n_cv_folds = glb_n_cv_folds,
    tune_models_df = NULL
  )
}
## [1] "fitting model: Max.cor.Y.rpart"
## [1] " indep_vars: A.nuppr.log"
## Aggregating results
## Fitting final model on full training set
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 4475
##
## CP nsplit rel error
## 1 0 0 1
##
## Node number 1: 4475 observations
## predicted class=N expected loss=0.1673743 P(node) =1
## class counts: 3726 749
## probabilities: 0.833 0.167
##
## n= 4475
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 4475 749 N (0.8326257 0.1673743) *
## [1] " calling mypredict_mdl for fit:"
## [1] "Classifier Probability Threshold: 0.5000 to maximize f.score.fit"
## Popular.fctr Popular.fctr.predict.Max.cor.Y.rpart.N
## 1 N 3726
## 2 Y 749
## Prediction
## Reference N Y
## N 3726 0
## Y 749 0
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.326257e-01 0.000000e+00 8.213602e-01 8.434553e-01 8.326257e-01
## AccuracyPValue McnemarPValue
## 5.097571e-01 1.800616e-164
## [1] " calling mypredict_mdl for OOB:"
## [1] "Classifier Probability Threshold: 0.5000 to maximize f.score.OOB"
## Popular.fctr Popular.fctr.predict.Max.cor.Y.rpart.N
## 1 N 1713
## 2 Y 344
## Prediction
## Reference N Y
## N 1713 0
## Y 344 0
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.327662e-01 0.000000e+00 8.159247e-01 8.486533e-01 8.327662e-01
## AccuracyPValue McnemarPValue
## 5.143944e-01 2.337097e-76
## model_id model_method feats max.nTuningRuns
## 1 Max.cor.Y.rpart rpart A.nuppr.log 1
## min.elapsedtime.everything min.elapsedtime.final max.auc.fit
## 1 1.207 0.057 0.5
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.5 0 0.8326258
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.8213602 0.8434553 0 0.5
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.5 0 0.8327662
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.8159247 0.8486533 0
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.0002791548 0
# Used to compare vs. Interactions.High.cor.Y
# Same predictor(s) as the rpart fits above, but the method follows the
# problem type: lm for regression, glm for binomial, rpart otherwise.
ret_lst <- myfit_mdl(
  model_id = "Max.cor.Y",
  model_method = if (glb_is_regression) {
    "lm"
  } else if (glb_is_binomial) {
    "glm"
  } else {
    "rpart"
  },
  model_type = glb_model_type,
  indep_vars_vctr = max_cor_y_x_var,
  rsp_var = glb_rsp_var,
  rsp_var_out = glb_rsp_var_out,
  fit_df = glb_fitobs_df,
  OOB_df = glb_OOBobs_df,
  n_cv_folds = glb_n_cv_folds,
  tune_models_df = NULL
)
## [1] "fitting model: Max.cor.Y.glm"
## [1] " indep_vars: A.nuppr.log"
## Aggregating results
## Fitting final model on full training set
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3585 -0.6318 -0.4867 -0.3464 2.6336
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.41620 0.11470 3.628 0.000285 ***
## A.nuppr.log -1.38947 0.08027 -17.310 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4042.7 on 4474 degrees of freedom
## Residual deviance: 3710.6 on 4473 degrees of freedom
## AIC: 3714.6
##
## Number of Fisher Scoring iterations: 5
##
## [1] " calling mypredict_mdl for fit:"
## threshold f.score
## 1 0.0 0.2867534
## 2 0.1 0.3499729
## 3 0.2 0.3986014
## 4 0.3 0.3121547
## 5 0.4 0.0000000
## 6 0.5 0.0000000
## 7 0.6 0.0000000
## 8 0.7 0.0000000
## 9 0.8 0.0000000
## 10 0.9 0.0000000
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.2000 to maximize f.score.fit"
## Popular.fctr Popular.fctr.predict.Max.cor.Y.glm.N
## 1 N 2872
## 2 Y 350
## Popular.fctr.predict.Max.cor.Y.glm.Y
## 1 854
## 2 399
## Prediction
## Reference N Y
## N 2872 854
## Y 350 399
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.309497e-01 2.392074e-01 7.176970e-01 7.439004e-01 8.326257e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 1.280095e-47
## [1] " calling mypredict_mdl for OOB:"
## threshold f.score
## 1 0.0 0.2865473
## 2 0.1 0.3485577
## 3 0.2 0.3880266
## 4 0.3 0.3465046
## 5 0.4 0.0000000
## 6 0.5 0.0000000
## 7 0.6 0.0000000
## 8 0.7 0.0000000
## 9 0.8 0.0000000
## 10 0.9 0.0000000
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.2000 to maximize f.score.OOB"
## Popular.fctr Popular.fctr.predict.Max.cor.Y.glm.N
## 1 N 1330
## 2 Y 169
## Popular.fctr.predict.Max.cor.Y.glm.Y
## 1 383
## 2 175
## Prediction
## Reference N Y
## N 1330 383
## Y 169 175
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.316480e-01 2.283681e-01 7.119353e-01 7.506985e-01 8.327662e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 1.236001e-19
## model_id model_method feats max.nTuningRuns
## 1 Max.cor.Y.glm glm A.nuppr.log 1
## min.elapsedtime.everything min.elapsedtime.final max.auc.fit
## 1 1.228 0.08 0.7073742
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.2 0.3986014 0.8324022
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.717697 0.7439004 -0.0004459345 0.710206
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.2 0.3880266 0.731648
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB min.aic.fit
## 1 0.7119353 0.7506985 0.2283681 3714.601
## max.AccuracySD.fit max.KappaSD.fit
## 1 6.48833e-05 0.0007723812
# Interactions.High.cor.Y
# Fit a model that combines max_cor_y_x_var with every feature named in
# glb_feats_df$cor.high.X (NA entries dropped). Nothing is fitted when that
# column names no features.
int_feats <- setdiff(unique(glb_feats_df$cor.high.X), NA)
if (length(int_feats) > 0) {
  # lm & glm handle interaction terms; rpart & rf do not
  indep_vars_vctr <- if (glb_is_regression || glb_is_binomial) {
    # Main effect plus one "<max_cor_y_x_var>:<feature>" term per feature.
    c(max_cor_y_x_var, paste(max_cor_y_x_var, int_feats, sep = ":"))
  } else {
    # Tree-based fallback: plain union of the features, no interaction terms.
    union(max_cor_y_x_var, int_feats)
  }
  # All arguments are passed by name, matching the other myfit_mdl calls in
  # this file, so the call does not depend on the order of myfit_mdl's formals.
  ret_lst <- myfit_mdl(
    model_id = "Interact.High.cor.Y",
    model_method = if (glb_is_regression) {
      "lm"
    } else if (glb_is_binomial) {
      "glm"
    } else {
      "rpart"
    },
    model_type = glb_model_type,
    indep_vars_vctr = indep_vars_vctr,
    rsp_var = glb_rsp_var,
    rsp_var_out = glb_rsp_var_out,
    fit_df = glb_fitobs_df,
    OOB_df = glb_OOBobs_df,
    n_cv_folds = glb_n_cv_folds,
    tune_models_df = NULL
  )
}
## [1] "fitting model: Interact.High.cor.Y.glm"
## [1] " indep_vars: A.nuppr.log, A.nuppr.log:A.nstopwrds.log, A.nuppr.log:A.sum.TfIdf, A.nuppr.log:S.ratio.nstopwrds.nwrds, A.nuppr.log:A.npnct19.log, A.nuppr.log:S.T.make, A.nuppr.log:H.npnct16.log, A.nuppr.log:S.npnct01.log, A.nuppr.log:S.T.can, A.nuppr.log:A.npnct21.log, A.nuppr.log:S.T.said, A.nuppr.log:A.npnct23.log, A.nuppr.log:S.T.one, A.nuppr.log:S.npnct07.log, A.nuppr.log:A.npnct18.log, A.nuppr.log:S.npnct03.log, A.nuppr.log:A.P.http, A.nuppr.log:A.npnct02.log, A.nuppr.log:S.P.year.colon, A.nuppr.log:S.T.obama, A.nuppr.log:S.npnct20.log, A.nuppr.log:S.P.first.draft, A.nuppr.log:S.T.take, A.nuppr.log:S.npnct06.log, A.nuppr.log:A.npnct17.log, A.nuppr.log:S.T.time, A.nuppr.log:S.T.new, A.nuppr.log:S.P.metropolitan.diary.colon, A.nuppr.log:H.T.polit, A.nuppr.log:A.T.year, A.nuppr.log:S.npnct12.log, A.nuppr.log:H.T.read, A.nuppr.log:A.T.will, A.nuppr.log:H.T.word, A.nuppr.log:A.T.senat, A.nuppr.log:S.T.show, A.nuppr.log:S.T.day, A.nuppr.log:S.npnct28.log, A.nuppr.log:H.T.clip, A.nuppr.log:A.T.first, A.nuppr.log:H.P.first.draft, A.nuppr.log:A.T.newyork, A.nuppr.log:A.T.report, A.nuppr.log:A.T.compani, A.nuppr.log:A.T.word, A.nuppr.log:A.npnct28.log, A.nuppr.log:A.T.newyorktim, A.nuppr.log:S.T.share, A.nuppr.log:H.T.billion, A.nuppr.log:A.npnct13.log, A.nuppr.log:A.T.articl, A.nuppr.log:H.P.today.in.politic, A.nuppr.log:H.T.springsumm, A.nuppr.log:S.T.diari, A.nuppr.log:S.npnct04.log, A.nuppr.log:H.T.report, A.nuppr.log:A.T.diari, A.nuppr.log:S.T.herald, A.nuppr.log:A.T.photo, A.nuppr.log:S.npnct15.log, A.nuppr.log:A.T.intern, A.nuppr.log:A.T.herald, A.nuppr.log:S.P.fashion.week, A.nuppr.log:S.T.intern, A.nuppr.log:A.T.archiv, A.nuppr.log:H.P.fashion.week, A.nuppr.log:H.T.X2015, A.nuppr.log:A.T.week, A.nuppr.log:S.npnct11.log, A.nuppr.log:S.nstopwrds.log, A.nuppr.log:S.ndgts.log, A.nuppr.log:H.nwrds.log, A.nuppr.log:S.nwrds.log, A.nuppr.log:H.nuppr.log, A.nuppr.log:A.nwrds.log, A.nuppr.log:S.nchrs.log, A.nuppr.log:S.nuppr.log"
## Aggregating results
## Fitting final model on full training set
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: not plotting observations with leverage one:
## 1143, 3637, 4105
## Warning: not plotting observations with leverage one:
## 1143, 3637, 4105
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.71953 -0.64306 -0.32847 -0.00001 3.14361
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error z value
## (Intercept) -4.430e-01 3.135e-01 -1.413
## A.nuppr.log 5.950e+00 2.009e+00 2.962
## `A.nuppr.log:A.nstopwrds.log` 5.605e+00 7.433e+00 0.754
## `A.nuppr.log:A.sum.TfIdf` 3.486e-02 4.135e-02 0.843
## `A.nuppr.log:S.ratio.nstopwrds.nwrds` -2.482e-01 2.322e+00 -0.107
## `A.nuppr.log:A.npnct19.log` 4.698e-01 1.438e-01 3.268
## `A.nuppr.log:S.T.make` 3.015e-01 3.362e-01 0.897
## `A.nuppr.log:H.npnct16.log` 9.020e-01 2.654e-01 3.399
## `A.nuppr.log:S.npnct01.log` 6.988e-01 4.851e-01 1.441
## `A.nuppr.log:S.T.can` 2.169e-01 4.210e-01 0.515
## `A.nuppr.log:A.npnct21.log` -1.070e+01 6.516e+03 -0.002
## `A.nuppr.log:S.T.said` 8.823e-01 4.084e-01 2.160
## `A.nuppr.log:A.npnct23.log` NA NA NA
## `A.nuppr.log:S.T.one` -2.029e-01 3.662e-01 -0.554
## `A.nuppr.log:S.npnct07.log` -3.609e+01 9.051e+03 -0.004
## `A.nuppr.log:A.npnct18.log` 4.003e+01 2.781e+04 0.001
## `A.nuppr.log:S.npnct03.log` -1.243e+01 4.783e+03 -0.003
## `A.nuppr.log:A.P.http` -2.715e+01 3.054e+04 -0.001
## `A.nuppr.log:A.npnct02.log` -1.101e+01 9.007e+03 -0.001
## `A.nuppr.log:S.P.year.colon` -1.200e+01 2.516e+03 -0.005
## `A.nuppr.log:S.T.obama` -3.129e-01 3.298e-01 -0.949
## `A.nuppr.log:S.npnct20.log` -1.629e+01 4.746e+03 -0.003
## `A.nuppr.log:S.P.first.draft` -8.992e+00 1.931e+03 -0.005
## `A.nuppr.log:S.T.take` -1.048e+00 5.725e-01 -1.831
## `A.nuppr.log:S.npnct06.log` -7.959e-01 8.305e-01 -0.958
## `A.nuppr.log:A.npnct17.log` -8.601e+00 7.947e+03 -0.001
## `A.nuppr.log:S.T.time` -2.576e-01 4.022e-01 -0.640
## `A.nuppr.log:S.T.new` -7.383e-01 4.143e-01 -1.782
## `A.nuppr.log:S.P.metropolitan.diary.colon` -4.464e+00 2.520e+00 -1.772
## `A.nuppr.log:H.T.polit` -4.445e-01 3.604e-01 -1.233
## `A.nuppr.log:A.T.year` -1.348e-01 4.810e-01 -0.280
## `A.nuppr.log:S.npnct12.log` -6.631e-02 1.048e-01 -0.632
## `A.nuppr.log:H.T.read` -4.046e-01 2.294e-01 -1.764
## `A.nuppr.log:A.T.will` -1.200e+00 4.188e-01 -2.866
## `A.nuppr.log:H.T.word` 7.487e-01 2.617e-01 2.861
## `A.nuppr.log:A.T.senat` -5.220e-01 3.534e-01 -1.477
## `A.nuppr.log:S.T.show` -1.853e+00 6.789e-01 -2.730
## `A.nuppr.log:S.T.day` -1.032e+00 5.781e-01 -1.786
## `A.nuppr.log:S.npnct28.log` -1.678e+01 1.734e+04 -0.001
## `A.nuppr.log:H.T.clip` -3.827e+00 6.661e+02 -0.006
## `A.nuppr.log:A.T.first` -3.332e-01 4.970e-01 -0.670
## `A.nuppr.log:H.P.first.draft` -2.088e+01 1.512e+03 -0.014
## `A.nuppr.log:A.T.newyork` 5.102e-01 3.602e-01 1.416
## `A.nuppr.log:A.T.report` -9.710e-01 7.037e-01 -1.380
## `A.nuppr.log:A.T.compani` -1.429e+00 5.790e-01 -2.467
## `A.nuppr.log:A.T.word` -3.546e+00 9.016e-01 -3.933
## `A.nuppr.log:A.npnct28.log` 7.940e+00 1.707e+04 0.000
## `A.nuppr.log:A.T.newyorktim` 2.817e-01 4.479e-01 0.629
## `A.nuppr.log:S.T.share` -1.632e+00 6.584e-01 -2.478
## `A.nuppr.log:H.T.billion` -4.819e-01 5.695e-01 -0.846
## `A.nuppr.log:A.npnct13.log` 6.633e-01 1.122e-01 5.912
## `A.nuppr.log:A.T.articl` -5.779e-01 7.926e-01 -0.729
## `A.nuppr.log:H.P.today.in.politic` -2.055e+01 2.248e+03 -0.009
## `A.nuppr.log:H.T.springsumm` 4.076e+00 2.795e+03 0.001
## `A.nuppr.log:S.T.diari` 8.433e+00 5.428e+00 1.554
## `A.nuppr.log:S.npnct04.log` -1.095e+00 4.291e-01 -2.552
## `A.nuppr.log:H.T.report` -1.029e+00 5.396e-01 -1.907
## `A.nuppr.log:A.T.diari` NA NA NA
## `A.nuppr.log:S.T.herald` -8.523e+00 9.642e+02 -0.009
## `A.nuppr.log:A.T.photo` -2.853e+00 1.297e+00 -2.200
## `A.nuppr.log:S.npnct15.log` -6.868e-02 2.399e-01 -0.286
## `A.nuppr.log:A.T.intern` 5.400e+02 2.076e+05 0.003
## `A.nuppr.log:A.T.herald` NA NA NA
## `A.nuppr.log:S.P.fashion.week` -8.572e+00 6.958e+02 -0.012
## `A.nuppr.log:S.T.intern` -5.407e+02 2.076e+05 -0.003
## `A.nuppr.log:A.T.archiv` -2.904e+01 1.704e+03 -0.017
## `A.nuppr.log:H.P.fashion.week` -1.676e+01 7.518e+02 -0.022
## `A.nuppr.log:H.T.X2015` -2.375e+01 2.865e+03 -0.008
## `A.nuppr.log:A.T.week` -2.183e+00 5.449e-01 -4.007
## `A.nuppr.log:S.npnct11.log` 4.837e-02 7.227e-02 0.669
## `A.nuppr.log:S.nstopwrds.log` -5.059e+00 7.487e+00 -0.676
## `A.nuppr.log:S.ndgts.log` -2.572e-01 7.648e-02 -3.363
## `A.nuppr.log:H.nwrds.log` -6.694e-01 2.045e-01 -3.274
## `A.nuppr.log:S.nwrds.log` 6.064e+00 8.452e+00 0.717
## `A.nuppr.log:H.nuppr.log` 1.287e-03 2.107e-01 0.006
## `A.nuppr.log:A.nwrds.log` -5.395e+00 8.402e+00 -0.642
## `A.nuppr.log:S.nchrs.log` -1.750e+00 3.803e-01 -4.602
## `A.nuppr.log:S.nuppr.log` -3.319e-01 1.837e-01 -1.807
## Pr(>|z|)
## (Intercept) 0.157685
## A.nuppr.log 0.003061 **
## `A.nuppr.log:A.nstopwrds.log` 0.450850
## `A.nuppr.log:A.sum.TfIdf` 0.399250
## `A.nuppr.log:S.ratio.nstopwrds.nwrds` 0.914895
## `A.nuppr.log:A.npnct19.log` 0.001084 **
## `A.nuppr.log:S.T.make` 0.369958
## `A.nuppr.log:H.npnct16.log` 0.000677 ***
## `A.nuppr.log:S.npnct01.log` 0.149711
## `A.nuppr.log:S.T.can` 0.606359
## `A.nuppr.log:A.npnct21.log` 0.998690
## `A.nuppr.log:S.T.said` 0.030757 *
## `A.nuppr.log:A.npnct23.log` NA
## `A.nuppr.log:S.T.one` 0.579580
## `A.nuppr.log:S.npnct07.log` 0.996818
## `A.nuppr.log:A.npnct18.log` 0.998851
## `A.nuppr.log:S.npnct03.log` 0.997926
## `A.nuppr.log:A.P.http` 0.999291
## `A.nuppr.log:A.npnct02.log` 0.999025
## `A.nuppr.log:S.P.year.colon` 0.996196
## `A.nuppr.log:S.T.obama` 0.342716
## `A.nuppr.log:S.npnct20.log` 0.997261
## `A.nuppr.log:S.P.first.draft` 0.996285
## `A.nuppr.log:S.T.take` 0.067062 .
## `A.nuppr.log:S.npnct06.log` 0.337901
## `A.nuppr.log:A.npnct17.log` 0.999136
## `A.nuppr.log:S.T.time` 0.521926
## `A.nuppr.log:S.T.new` 0.074770 .
## `A.nuppr.log:S.P.metropolitan.diary.colon` 0.076402 .
## `A.nuppr.log:H.T.polit` 0.217397
## `A.nuppr.log:A.T.year` 0.779365
## `A.nuppr.log:S.npnct12.log` 0.527125
## `A.nuppr.log:H.T.read` 0.077807 .
## `A.nuppr.log:A.T.will` 0.004159 **
## `A.nuppr.log:H.T.word` 0.004225 **
## `A.nuppr.log:A.T.senat` 0.139685
## `A.nuppr.log:S.T.show` 0.006340 **
## `A.nuppr.log:S.T.day` 0.074110 .
## `A.nuppr.log:S.npnct28.log` 0.999228
## `A.nuppr.log:H.T.clip` 0.995416
## `A.nuppr.log:A.T.first` 0.502570
## `A.nuppr.log:H.P.first.draft` 0.988981
## `A.nuppr.log:A.T.newyork` 0.156705
## `A.nuppr.log:A.T.report` 0.167636
## `A.nuppr.log:A.T.compani` 0.013619 *
## `A.nuppr.log:A.T.word` 8.39e-05 ***
## `A.nuppr.log:A.npnct28.log` 0.999629
## `A.nuppr.log:A.T.newyorktim` 0.529337
## `A.nuppr.log:S.T.share` 0.013206 *
## `A.nuppr.log:H.T.billion` 0.397495
## `A.nuppr.log:A.npnct13.log` 3.38e-09 ***
## `A.nuppr.log:A.T.articl` 0.465920
## `A.nuppr.log:H.P.today.in.politic` 0.992705
## `A.nuppr.log:H.T.springsumm` 0.998837
## `A.nuppr.log:S.T.diari` 0.120250
## `A.nuppr.log:S.npnct04.log` 0.010705 *
## `A.nuppr.log:H.T.report` 0.056567 .
## `A.nuppr.log:A.T.diari` NA
## `A.nuppr.log:S.T.herald` 0.992947
## `A.nuppr.log:A.T.photo` 0.027804 *
## `A.nuppr.log:S.npnct15.log` 0.774646
## `A.nuppr.log:A.T.intern` 0.997924
## `A.nuppr.log:A.T.herald` NA
## `A.nuppr.log:S.P.fashion.week` 0.990171
## `A.nuppr.log:S.T.intern` 0.997922
## `A.nuppr.log:A.T.archiv` 0.986402
## `A.nuppr.log:H.P.fashion.week` 0.982213
## `A.nuppr.log:H.T.X2015` 0.993385
## `A.nuppr.log:A.T.week` 6.14e-05 ***
## `A.nuppr.log:S.npnct11.log` 0.503353
## `A.nuppr.log:S.nstopwrds.log` 0.499191
## `A.nuppr.log:S.ndgts.log` 0.000771 ***
## `A.nuppr.log:H.nwrds.log` 0.001060 **
## `A.nuppr.log:S.nwrds.log` 0.473106
## `A.nuppr.log:H.nuppr.log` 0.995125
## `A.nuppr.log:A.nwrds.log` 0.520769
## `A.nuppr.log:S.nchrs.log` 4.18e-06 ***
## `A.nuppr.log:S.nuppr.log` 0.070795 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4042.7 on 4474 degrees of freedom
## Residual deviance: 3150.3 on 4400 degrees of freedom
## AIC: 3300.3
##
## Number of Fisher Scoring iterations: 19
##
## [1] " calling mypredict_mdl for fit:"
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## threshold f.score
## 1 0.0 0.286753446
## 2 0.1 0.420919176
## 3 0.2 0.481450253
## 4 0.3 0.483463620
## 5 0.4 0.413910093
## 6 0.5 0.320166320
## 7 0.6 0.169856459
## 8 0.7 0.066838046
## 9 0.8 0.010624170
## 10 0.9 0.002666667
## 11 1.0 0.000000000
## [1] "Classifier Probability Threshold: 0.3000 to maximize f.score.fit"
## Popular.fctr Popular.fctr.predict.Interact.High.cor.Y.glm.N
## 1 N 3214
## 2 Y 347
## Popular.fctr.predict.Interact.High.cor.Y.glm.Y
## 1 512
## 2 402
## Prediction
## Reference N Y
## N 3214 512
## Y 347 402
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.080447e-01 3.670046e-01 7.961944e-01 8.194916e-01 8.326257e-01
## AccuracyPValue McnemarPValue
## 9.999931e-01 2.198502e-08
## [1] " calling mypredict_mdl for OOB:"
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## threshold f.score
## 1 0.0 0.28654727
## 2 0.1 0.40549828
## 3 0.2 0.46226415
## 4 0.3 0.46517740
## 5 0.4 0.36981132
## 6 0.5 0.32272727
## 7 0.6 0.16753927
## 8 0.7 0.09392265
## 9 0.8 0.01152738
## 10 0.9 0.00000000
## 11 1.0 0.00000000
## [1] "Classifier Probability Threshold: 0.3000 to maximize f.score.OOB"
## Popular.fctr Popular.fctr.predict.Interact.High.cor.Y.glm.N
## 1 N 1473
## 2 Y 167
## Popular.fctr.predict.Interact.High.cor.Y.glm.Y
## 1 240
## 2 177
## Prediction
## Reference N Y
## N 1473 240
## Y 167 177
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.802139037 0.345161172 0.784254447 0.819158179 0.832766164
## AccuracyPValue McnemarPValue
## 0.999880076 0.000358473
## model_id model_method
## 1 Interact.High.cor.Y.glm glm
## feats
## 1 A.nuppr.log, A.nuppr.log:A.nstopwrds.log, A.nuppr.log:A.sum.TfIdf, A.nuppr.log:S.ratio.nstopwrds.nwrds, A.nuppr.log:A.npnct19.log, A.nuppr.log:S.T.make, A.nuppr.log:H.npnct16.log, A.nuppr.log:S.npnct01.log, A.nuppr.log:S.T.can, A.nuppr.log:A.npnct21.log, A.nuppr.log:S.T.said, A.nuppr.log:A.npnct23.log, A.nuppr.log:S.T.one, A.nuppr.log:S.npnct07.log, A.nuppr.log:A.npnct18.log, A.nuppr.log:S.npnct03.log, A.nuppr.log:A.P.http, A.nuppr.log:A.npnct02.log, A.nuppr.log:S.P.year.colon, A.nuppr.log:S.T.obama, A.nuppr.log:S.npnct20.log, A.nuppr.log:S.P.first.draft, A.nuppr.log:S.T.take, A.nuppr.log:S.npnct06.log, A.nuppr.log:A.npnct17.log, A.nuppr.log:S.T.time, A.nuppr.log:S.T.new, A.nuppr.log:S.P.metropolitan.diary.colon, A.nuppr.log:H.T.polit, A.nuppr.log:A.T.year, A.nuppr.log:S.npnct12.log, A.nuppr.log:H.T.read, A.nuppr.log:A.T.will, A.nuppr.log:H.T.word, A.nuppr.log:A.T.senat, A.nuppr.log:S.T.show, A.nuppr.log:S.T.day, A.nuppr.log:S.npnct28.log, A.nuppr.log:H.T.clip, A.nuppr.log:A.T.first, A.nuppr.log:H.P.first.draft, A.nuppr.log:A.T.newyork, A.nuppr.log:A.T.report, A.nuppr.log:A.T.compani, A.nuppr.log:A.T.word, A.nuppr.log:A.npnct28.log, A.nuppr.log:A.T.newyorktim, A.nuppr.log:S.T.share, A.nuppr.log:H.T.billion, A.nuppr.log:A.npnct13.log, A.nuppr.log:A.T.articl, A.nuppr.log:H.P.today.in.politic, A.nuppr.log:H.T.springsumm, A.nuppr.log:S.T.diari, A.nuppr.log:S.npnct04.log, A.nuppr.log:H.T.report, A.nuppr.log:A.T.diari, A.nuppr.log:S.T.herald, A.nuppr.log:A.T.photo, A.nuppr.log:S.npnct15.log, A.nuppr.log:A.T.intern, A.nuppr.log:A.T.herald, A.nuppr.log:S.P.fashion.week, A.nuppr.log:S.T.intern, A.nuppr.log:A.T.archiv, A.nuppr.log:H.P.fashion.week, A.nuppr.log:H.T.X2015, A.nuppr.log:A.T.week, A.nuppr.log:S.npnct11.log, A.nuppr.log:S.nstopwrds.log, A.nuppr.log:S.ndgts.log, A.nuppr.log:H.nwrds.log, A.nuppr.log:S.nwrds.log, A.nuppr.log:H.nuppr.log, A.nuppr.log:A.nwrds.log, A.nuppr.log:S.nchrs.log, A.nuppr.log:S.nuppr.log
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 1 4.292 1.812
## max.auc.fit opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.8158344 0.3 0.4834636 0.8480454
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.7961944 0.8194916 0.2423449 0.7911694
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.3 0.4651774 0.802139
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB min.aic.fit
## 1 0.7842544 0.8191582 0.3451612 3300.299
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.007207693 0.03117832
# Low.cor.X
# Fit a model on the ids of all glb_feats_df rows where cor.high.X is NA,
# myNearZV is FALSE and exclude.as.feat is not 1.
#
# Disabled alternative selection for binomial classification, kept for
# reference (it was the `if` branch in front of the active assignment below):
# if (glb_is_classification && glb_is_binomial)
#     indep_vars_vctr <- subset(glb_feats_df, is.na(cor.high.X) &
#                                             is.ConditionalX.y &
#                                             (exclude.as.feat != 1))[, "id"] else
indep_vars_vctr <- subset(
  glb_feats_df,
  is.na(cor.high.X) & !myNearZV & (exclude.as.feat != 1)
)[, "id"]
# All arguments are passed by name, matching the other myfit_mdl calls in this
# file, so the call does not depend on the order of myfit_mdl's formals.
ret_lst <- myfit_mdl(
  model_id = "Low.cor.X",
  model_method = if (glb_is_regression) {
    "lm"
  } else if (glb_is_binomial) {
    "glm"
  } else {
    "rpart"
  },
  model_type = glb_model_type,
  indep_vars_vctr = indep_vars_vctr,
  rsp_var = glb_rsp_var,
  rsp_var_out = glb_rsp_var_out,
  fit_df = glb_fitobs_df,
  OOB_df = glb_OOBobs_df,
  n_cv_folds = glb_n_cv_folds,
  tune_models_df = NULL
)
## [1] "fitting model: Low.cor.X.glm"
## [1] " indep_vars: WordCount.log, S.ratio.sum.TfIdf.nwrds, H.ratio.sum.TfIdf.nwrds, .clusterid.fctr, H.sum.TfIdf, A.sum.TfIdf, PubDate.hour.fctr, H.npnct19.log, S.ratio.nstopwrds.nwrds, PubDate.wkend, H.P.recap.colon, H.P.quandary, H.P.no.comment.colon, A.npnct19.log, H.P.facts.figures, H.npnct08.log, PubDate.last10.log, PubDate.last1.log, H.P.readers.respond, S.T.make, H.ratio.nstopwrds.nwrds, H.T.get, S.npnct01.log, H.npnct16.log, S.T.can, H.T.ebola, H.npnct01.log, S.T.said, H.T.make, H.npnct11.log, myCategory.fctr, S.T.one, H.P.s.notebook, H.T.take, A.npnct16.log, S.npnct16.log, A.T.presid, S.T.presid, S.npnct08.log, A.npnct08.log, PubDate.last100.log, .rnorm, H.npnct05.log, H.P.friday.night.music, H.T.say, H.T.obama, H.T.bank, PubDate.date.fctr, PubDate.second.fctr, H.npnct07.log, S.npnct07.log, S.npnct03.log, A.npnct18.log, H.npnct12.log, H.T.word, H.T.big, S.P.year.colon, S.T.obama, S.npnct20.log, H.npnct02.log, H.T.test, S.npnct14.log, H.P.on.this.day, S.P.first.draft, S.T.take, S.npnct06.log, S.T.time, H.T.newyorktim, H.npnct13.log, H.T.deal, S.T.new, H.T.billion, S.P.metropolitan.diary.colon, H.T.polit, H.P.verbatim.colon, H.T.china, H.T.art, PubDate.minute.fctr, H.T.read, S.npnct12.log, A.T.year, A.T.will, S.T.appear, PubDate.wkday.fctr, H.T.pictur, H.T.new, A.T.senat, S.T.show, H.P.today.in.smallbusiness, S.T.day, H.P.first.draft, S.npnct28.log, H.P.daily.clip.report, H.T.clip, S.P.daily.clip.report, A.T.first, H.T.news, H.T.X2014, A.T.newyork, A.T.report, A.T.compani, A.T.word, H.T.busi, A.T.newyorktim, A.npnct13.log, S.T.share, A.T.articl, H.T.newyork, H.T.springsumm, H.T.day, S.T.diari, H.T.report, S.npnct04.log, S.T.herald, S.npnct15.log, H.T.week, A.T.photo, A.T.intern, S.T.tribun, S.P.fashion.week, S.T.archiv, H.P.fashion.week, H.npnct15.log, A.T.fashion, A.T.week, H.nstopwrds.log, H.npnct28.log, S.npnct11.log, S.nstopwrds.log, H.ndgts.log, S.ndgts.log, H.nuppr.log, H.nwrds.log, S.nwrds.log, A.nchrs.log, A.nwrds.unq.log, S.nuppr.log"
## Aggregating results
## Fitting final model on full training set
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1668 -0.2703 -0.0764 0.0000 3.9538
##
## Coefficients: (11 not defined because of singularities)
## Estimate
## (Intercept) -5.030e+00
## WordCount.log 1.328e+00
## S.ratio.sum.TfIdf.nwrds 1.352e+00
## H.ratio.sum.TfIdf.nwrds 2.089e-01
## .clusterid.fctr101 -1.514e+00
## .clusterid.fctr102 -1.983e+00
## .clusterid.fctr103 -2.060e+00
## .clusterid.fctr104 -1.689e+00
## .clusterid.fctr401 -6.466e+00
## .clusterid.fctr402 -2.233e+01
## .clusterid.fctr403 -4.802e+00
## .clusterid.fctr404 -4.324e+00
## .clusterid.fctr405 -3.755e+00
## .clusterid.fctr406 -4.902e+00
## .clusterid.fctr407 -2.207e+01
## .clusterid.fctr408 -6.368e+00
## .clusterid.fctr409 -6.186e+00
## .clusterid.fctr410 -2.238e+01
## .clusterid.fctr411 -2.151e+01
## .clusterid.fctr412 -2.222e+01
## .clusterid.fctr413 -2.284e+01
## .clusterid.fctr414 -2.690e+00
## .clusterid.fctr415 -2.239e+01
## .clusterid.fctr501 -3.957e+00
## .clusterid.fctr502 -5.542e+00
## .clusterid.fctr503 -5.543e+00
## .clusterid.fctr504 -5.790e+00
## .clusterid.fctr505 -4.697e+00
## .clusterid.fctr506 -5.054e+00
## .clusterid.fctr507 -5.145e+00
## .clusterid.fctr508 -5.611e+00
## .clusterid.fctr509 -4.861e+00
## .clusterid.fctr510 -4.345e+00
## .clusterid.fctr511 -5.468e+00
## .clusterid.fctr512 -4.409e+00
## .clusterid.fctr513 -4.506e+00
## .clusterid.fctr701 -3.658e+00
## .clusterid.fctr702 -3.950e+00
## .clusterid.fctr703 -2.096e+01
## .clusterid.fctr704 -4.420e+00
## .clusterid.fctr705 -4.503e+00
## .clusterid.fctr706 -2.953e+00
## .clusterid.fctr707 -5.127e+00
## .clusterid.fctr1101 -1.007e+00
## .clusterid.fctr1102 2.173e-01
## .clusterid.fctr1103 1.037e-01
## .clusterid.fctr1104 -1.254e+00
## .clusterid.fctr1105 -1.743e+00
## .clusterid.fctr1106 -1.235e+00
## .clusterid.fctr1107 -3.117e+00
## .clusterid.fctr1108 -1.335e+00
## .clusterid.fctr1109 1.868e+01
## .clusterid.fctr1501 -4.402e+00
## .clusterid.fctr1502 -4.866e+00
## .clusterid.fctr1503 -3.608e+00
## .clusterid.fctr1504 -5.552e+00
## .clusterid.fctr1505 -4.782e+00
## .clusterid.fctr1506 -3.150e+00
## .clusterid.fctr1507 -4.600e+00
## .clusterid.fctr1508 -4.558e+00
## .clusterid.fctr1509 -4.017e+00
## .clusterid.fctr1510 -5.980e+00
## .clusterid.fctr1511 -2.309e+01
## .clusterid.fctr1512 -4.016e+00
## .clusterid.fctr1513 -4.971e+00
## .clusterid.fctr1514 -1.867e+01
## .clusterid.fctr1515 -6.533e+00
## .clusterid.fctr1516 -3.658e+00
## .clusterid.fctr1517 -3.088e+00
## .clusterid.fctr1518 -6.116e+00
## .clusterid.fctr1519 -5.024e+00
## .clusterid.fctr1520 -1.615e+01
## .clusterid.fctr1521 -4.031e+00
## .clusterid.fctr1522 -4.525e+00
## .clusterid.fctr1523 -2.194e+01
## .clusterid.fctr1524 -3.368e+00
## .clusterid.fctr1801 -2.967e+00
## .clusterid.fctr1802 -2.315e+00
## .clusterid.fctr1803 -1.908e+00
## .clusterid.fctr1804 -1.858e+00
## H.sum.TfIdf -1.452e-01
## A.sum.TfIdf -1.593e-01
## `PubDate.hour.fctr(7.67,15.3]` 1.913e-01
## `PubDate.hour.fctr(15.3,23]` 3.653e-01
## H.npnct19.log 1.693e+00
## S.ratio.nstopwrds.nwrds -6.188e+00
## PubDate.wkend -1.808e-01
## H.P.recap.colon 1.840e+00
## H.P.quandary 2.194e+01
## H.P.no.comment.colon 2.040e+00
## A.npnct19.log 1.366e+00
## H.P.facts.figures 9.467e-01
## H.npnct08.log 1.208e+00
## PubDate.last10.log 1.838e-01
## PubDate.last1.log -2.865e-02
## H.P.readers.respond 6.912e+00
## S.T.make -1.305e+00
## H.ratio.nstopwrds.nwrds 5.443e+00
## H.T.get 4.871e-01
## S.npnct01.log 2.462e+00
## H.npnct16.log 5.659e-01
## S.T.can -1.619e+00
## H.T.ebola -1.239e-01
## H.npnct01.log -1.264e+00
## S.T.said 7.922e-01
## H.T.make -3.328e-01
## H.npnct11.log 4.187e-01
## `myCategory.fctrForeign#World#Asia Pacific` -5.996e+00
## `myCategory.fctr#Multimedia#` -6.083e+00
## `myCategory.fctrCulture#Arts#` NA
## `myCategory.fctrBusiness#Business Day#Dealbook` NA
## myCategory.fctrmyOther -2.268e+01
## `myCategory.fctrBusiness#Technology#` NA
## `myCategory.fctrBusiness#Crosswords/Games#` -6.647e-01
## `myCategory.fctrTStyle##` -6.193e+00
## `myCategory.fctrForeign#World#` -2.160e+01
## `myCategory.fctrOpEd#Opinion#` NA
## `myCategory.fctrStyles##Fashion` -2.275e+01
## `myCategory.fctr#Opinion#Room For Debate` -8.740e+00
## `myCategory.fctr#U.S.#Education` -2.379e+01
## `myCategory.fctr##` NA
## `myCategory.fctrMetro#N.Y. / Region#` -3.691e+00
## `myCategory.fctrBusiness#Business Day#Small Business` -6.186e+00
## `myCategory.fctrStyles#U.S.#` NA
## `myCategory.fctrTravel#Travel#` -5.764e+00
## `myCategory.fctr#Opinion#The Public Editor` NA
## S.T.one -1.001e+00
## H.P.s.notebook -1.685e+01
## H.T.take -4.009e-01
## A.npnct16.log -6.838e-01
## S.npnct16.log NA
## A.T.presid 4.792e+02
## S.T.presid -4.788e+02
## S.npnct08.log 1.130e+00
## A.npnct08.log NA
## PubDate.last100.log 1.915e-02
## .rnorm -7.205e-02
## H.npnct05.log -2.508e+01
## H.P.friday.night.music -2.342e+00
## H.T.say -4.675e-01
## H.T.obama -1.512e-01
## H.T.bank -6.400e-02
## `PubDate.date.fctr(7,13]` -2.675e-02
## `PubDate.date.fctr(13,19]` -1.162e-01
## `PubDate.date.fctr(19,25]` -1.139e-01
## `PubDate.date.fctr(25,31]` 8.689e-02
## `PubDate.second.fctr(14.8,29.5]` 8.971e-02
## `PubDate.second.fctr(29.5,44.2]` -1.987e-02
## `PubDate.second.fctr(44.2,59.1]` -2.213e-01
## H.npnct07.log 3.248e-01
## S.npnct07.log -2.521e+01
## S.npnct03.log -2.993e+01
## A.npnct18.log -2.802e+01
## H.npnct12.log 4.017e-01
## H.T.word 2.508e+00
## H.T.big -3.376e-01
## S.P.year.colon -1.058e+01
## S.T.obama -8.676e-01
## S.npnct20.log -2.684e+01
## H.npnct02.log -1.859e+01
## H.T.test -1.587e-01
## S.npnct14.log 8.653e-01
## H.P.on.this.day -1.454e+01
## S.P.first.draft -1.548e+01
## S.T.take -1.358e+00
## S.npnct06.log 7.141e-01
## S.T.time -1.124e+00
## H.T.newyorktim -1.221e-01
## H.npnct13.log -2.796e-01
## H.T.deal -2.353e+01
## S.T.new 1.629e-02
## H.T.billion -8.357e-02
## S.P.metropolitan.diary.colon -8.723e+00
## H.T.polit -5.421e-01
## H.P.verbatim.colon -1.454e+01
## H.T.china -9.870e-01
## H.T.art -1.212e+00
## `PubDate.minute.fctr(14.8,29.5]` -2.019e-01
## `PubDate.minute.fctr(29.5,44.2]` -2.394e-01
## `PubDate.minute.fctr(44.2,59.1]` 6.186e-02
## H.T.read -1.092e+00
## S.npnct12.log -1.700e-01
## A.T.year -2.826e-01
## A.T.will -1.028e+00
## S.T.appear -5.878e-01
## PubDate.wkday.fctr1 -2.992e-01
## PubDate.wkday.fctr2 -8.611e-01
## PubDate.wkday.fctr3 -4.455e-01
## PubDate.wkday.fctr4 -6.742e-01
## PubDate.wkday.fctr5 -5.502e-01
## PubDate.wkday.fctr6 -1.061e+00
## H.T.pictur 1.411e-01
## H.T.new -5.904e-01
## A.T.senat 8.068e-01
## S.T.show -1.353e+00
## H.P.today.in.smallbusiness -1.560e+01
## S.T.day -9.731e-01
## H.P.first.draft -1.518e+01
## S.npnct28.log -1.494e+01
## H.P.daily.clip.report -1.570e+01
## H.T.clip NA
## S.P.daily.clip.report NA
## A.T.first 1.077e+00
## H.T.news -6.977e-01
## H.T.X2014 -6.582e-01
## A.T.newyork 2.368e+00
## A.T.report -1.648e+00
## A.T.compani -6.329e-01
## A.T.word -7.426e-01
## H.T.busi -5.133e-01
## A.T.newyorktim 2.318e+00
## A.npnct13.log 1.028e+00
## S.T.share -1.787e+00
## A.T.articl -3.262e+00
## H.T.newyork -8.485e-01
## H.T.springsumm -1.186e+01
## H.T.day -4.700e-01
## S.T.diari 2.075e+01
## H.T.report -1.296e+00
## S.npnct04.log -1.227e+00
## S.T.herald 5.240e+01
## S.npnct15.log 8.196e-02
## H.T.week -8.774e-01
## A.T.photo -2.082e+00
## A.T.intern -3.241e+00
## S.T.tribun -4.993e+01
## S.P.fashion.week 2.553e+00
## S.T.archiv -4.537e+01
## H.P.fashion.week -1.409e+01
## H.npnct15.log -1.630e+00
## A.T.fashion -5.737e+01
## A.T.week -1.993e-01
## H.nstopwrds.log -1.233e+00
## H.npnct28.log -1.532e+00
## S.npnct11.log -1.121e-01
## S.nstopwrds.log 2.222e+00
## H.ndgts.log 6.602e-01
## S.ndgts.log 2.324e-01
## H.nuppr.log 1.569e+00
## H.nwrds.log -3.646e-01
## S.nwrds.log -3.602e-01
## A.nchrs.log -2.296e-01
## A.nwrds.unq.log -1.267e+00
## S.nuppr.log -5.401e-01
## Std. Error z value
## (Intercept) 5.314e+00 -0.947
## WordCount.log 1.139e-01 11.662
## S.ratio.sum.TfIdf.nwrds 6.247e-01 2.164
## H.ratio.sum.TfIdf.nwrds 2.001e-01 1.044
## .clusterid.fctr101 1.555e+00 -0.973
## .clusterid.fctr102 1.584e+00 -1.252
## .clusterid.fctr103 1.684e+00 -1.223
## .clusterid.fctr104 1.714e+00 -0.985
## .clusterid.fctr401 2.015e+00 -3.209
## .clusterid.fctr402 2.623e+03 -0.009
## .clusterid.fctr403 1.934e+00 -2.483
## .clusterid.fctr404 1.710e+00 -2.528
## .clusterid.fctr405 1.632e+00 -2.301
## .clusterid.fctr406 1.863e+00 -2.631
## .clusterid.fctr407 2.563e+03 -0.009
## .clusterid.fctr408 2.030e+00 -3.137
## .clusterid.fctr409 2.038e+00 -3.035
## .clusterid.fctr410 3.419e+03 -0.007
## .clusterid.fctr411 4.105e+03 -0.005
## .clusterid.fctr412 3.749e+03 -0.006
## .clusterid.fctr413 4.528e+03 -0.005
## .clusterid.fctr414 1.803e+00 -1.492
## .clusterid.fctr415 5.275e+03 -0.004
## .clusterid.fctr501 1.555e+00 -2.545
## .clusterid.fctr502 1.629e+00 -3.403
## .clusterid.fctr503 1.665e+00 -3.329
## .clusterid.fctr504 1.692e+00 -3.421
## .clusterid.fctr505 1.903e+00 -2.468
## .clusterid.fctr506 1.611e+00 -3.136
## .clusterid.fctr507 1.665e+00 -3.091
## .clusterid.fctr508 1.882e+00 -2.981
## .clusterid.fctr509 1.714e+00 -2.836
## .clusterid.fctr510 1.691e+00 -2.569
## .clusterid.fctr511 1.880e+00 -2.908
## .clusterid.fctr512 1.700e+00 -2.594
## .clusterid.fctr513 1.723e+00 -2.615
## .clusterid.fctr701 1.587e+00 -2.304
## .clusterid.fctr702 1.614e+00 -2.448
## .clusterid.fctr703 2.585e+03 -0.008
## .clusterid.fctr704 1.626e+00 -2.718
## .clusterid.fctr705 1.644e+00 -2.738
## .clusterid.fctr706 1.587e+00 -1.861
## .clusterid.fctr707 1.913e+00 -2.680
## .clusterid.fctr1101 1.548e+00 -0.650
## .clusterid.fctr1102 1.615e+00 0.135
## .clusterid.fctr1103 1.732e+00 0.060
## .clusterid.fctr1104 1.598e+00 -0.785
## .clusterid.fctr1105 1.603e+00 -1.087
## .clusterid.fctr1106 1.679e+00 -0.736
## .clusterid.fctr1107 1.629e+00 -1.914
## .clusterid.fctr1108 1.641e+00 -0.813
## .clusterid.fctr1109 4.922e+03 0.004
## .clusterid.fctr1501 1.612e+00 -2.732
## .clusterid.fctr1502 1.627e+00 -2.990
## .clusterid.fctr1503 1.579e+00 -2.285
## .clusterid.fctr1504 1.668e+00 -3.328
## .clusterid.fctr1505 1.630e+00 -2.933
## .clusterid.fctr1506 1.817e+00 -1.733
## .clusterid.fctr1507 1.766e+00 -2.605
## .clusterid.fctr1508 1.787e+00 -2.551
## .clusterid.fctr1509 1.617e+00 -2.485
## .clusterid.fctr1510 1.956e+00 -3.057
## .clusterid.fctr1511 3.096e+03 -0.007
## .clusterid.fctr1512 1.714e+00 -2.343
## .clusterid.fctr1513 1.875e+00 -2.650
## .clusterid.fctr1514 2.349e+03 -0.008
## .clusterid.fctr1515 1.948e+00 -3.353
## .clusterid.fctr1516 1.631e+00 -2.243
## .clusterid.fctr1517 1.692e+00 -1.826
## .clusterid.fctr1518 1.909e+00 -3.203
## .clusterid.fctr1519 1.682e+00 -2.987
## .clusterid.fctr1520 3.822e+03 -0.004
## .clusterid.fctr1521 1.686e+00 -2.392
## .clusterid.fctr1522 1.712e+00 -2.643
## .clusterid.fctr1523 4.277e+03 -0.005
## .clusterid.fctr1524 2.375e+00 -1.418
## .clusterid.fctr1801 1.558e+00 -1.904
## .clusterid.fctr1802 1.593e+00 -1.453
## .clusterid.fctr1803 1.668e+00 -1.144
## .clusterid.fctr1804 2.153e+00 -0.863
## H.sum.TfIdf 9.693e-02 -1.498
## A.sum.TfIdf 1.228e-01 -1.297
## `PubDate.hour.fctr(7.67,15.3]` 2.668e-01 0.717
## `PubDate.hour.fctr(15.3,23]` 2.734e-01 1.336
## H.npnct19.log 3.624e-01 4.671
## S.ratio.nstopwrds.nwrds 5.160e+00 -1.199
## PubDate.wkend 4.642e-01 -0.389
## H.P.recap.colon 1.237e+00 1.487
## H.P.quandary 6.383e+03 0.003
## H.P.no.comment.colon 1.126e+00 1.812
## A.npnct19.log 4.109e-01 3.325
## H.P.facts.figures 1.518e+00 0.624
## H.npnct08.log 5.060e-01 2.387
## PubDate.last10.log 1.318e-01 1.394
## PubDate.last1.log 4.869e-02 -0.588
## H.P.readers.respond 1.151e+00 6.003
## S.T.make 6.489e-01 -2.012
## H.ratio.nstopwrds.nwrds 2.872e+00 1.895
## H.T.get 4.354e-01 1.119
## S.npnct01.log 2.204e+00 1.117
## H.npnct16.log 7.020e-01 0.806
## S.T.can 8.908e-01 -1.818
## H.T.ebola 3.304e-01 -0.375
## H.npnct01.log 1.389e+00 -0.910
## S.T.said 9.205e-01 0.861
## H.T.make 3.710e-01 -0.897
## H.npnct11.log 2.266e-01 1.848
## `myCategory.fctrForeign#World#Asia Pacific` 1.661e+00 -3.610
## `myCategory.fctr#Multimedia#` 1.717e+00 -3.542
## `myCategory.fctrCulture#Arts#` NA NA
## `myCategory.fctrBusiness#Business Day#Dealbook` NA NA
## myCategory.fctrmyOther 2.984e+03 -0.008
## `myCategory.fctrBusiness#Technology#` NA NA
## `myCategory.fctrBusiness#Crosswords/Games#` 1.597e+00 -0.416
## `myCategory.fctrTStyle##` 1.589e+00 -3.898
## `myCategory.fctrForeign#World#` 1.879e+03 -0.011
## `myCategory.fctrOpEd#Opinion#` NA NA
## `myCategory.fctrStyles##Fashion` 1.546e+03 -0.015
## `myCategory.fctr#Opinion#Room For Debate` 1.750e+00 -4.994
## `myCategory.fctr#U.S.#Education` 9.385e+02 -0.025
## `myCategory.fctr##` NA NA
## `myCategory.fctrMetro#N.Y. / Region#` 1.602e+00 -2.303
## `myCategory.fctrBusiness#Business Day#Small Business` 1.665e+00 -3.714
## `myCategory.fctrStyles#U.S.#` NA NA
## `myCategory.fctrTravel#Travel#` 1.845e+00 -3.124
## `myCategory.fctr#Opinion#The Public Editor` NA NA
## S.T.one 6.924e-01 -1.446
## H.P.s.notebook 8.449e+03 -0.002
## H.T.take 4.984e-01 -0.804
## A.npnct16.log 1.385e+00 -0.494
## S.npnct16.log NA NA
## A.T.presid 1.548e+05 0.003
## S.T.presid 1.548e+05 -0.003
## S.npnct08.log 7.335e-01 1.540
## A.npnct08.log NA NA
## PubDate.last100.log 4.824e-02 0.397
## .rnorm 6.898e-02 -1.044
## H.npnct05.log 9.901e+03 -0.003
## H.P.friday.night.music 1.348e+00 -1.737
## H.T.say 4.615e-01 -1.013
## H.T.obama 4.766e-01 -0.317
## H.T.bank 5.200e-01 -0.123
## `PubDate.date.fctr(7,13]` 2.129e-01 -0.126
## `PubDate.date.fctr(13,19]` 2.123e-01 -0.547
## `PubDate.date.fctr(19,25]` 2.074e-01 -0.549
## `PubDate.date.fctr(25,31]` 2.254e-01 0.386
## `PubDate.second.fctr(14.8,29.5]` 1.903e-01 0.471
## `PubDate.second.fctr(29.5,44.2]` 1.861e-01 -0.107
## `PubDate.second.fctr(44.2,59.1]` 1.935e-01 -1.144
## H.npnct07.log 2.276e-01 1.427
## S.npnct07.log 1.106e+04 -0.002
## S.npnct03.log 8.796e+03 -0.003
## A.npnct18.log 8.725e+03 -0.003
## H.npnct12.log 3.478e-01 1.155
## H.T.word 1.021e+00 2.457
## H.T.big 6.423e-01 -0.526
## S.P.year.colon 3.884e+03 -0.003
## S.T.obama 1.498e+00 -0.579
## S.npnct20.log 7.852e+03 -0.003
## H.npnct02.log 4.924e+03 -0.004
## H.T.test 7.261e-01 -0.218
## S.npnct14.log 1.661e+00 0.521
## H.P.on.this.day 5.627e+03 -0.003
## S.P.first.draft 4.076e+03 -0.004
## S.T.take 1.180e+00 -1.151
## S.npnct06.log 1.564e+00 0.457
## S.T.time 9.718e-01 -1.157
## H.T.newyorktim 8.069e-01 -0.151
## H.npnct13.log 2.435e-01 -1.148
## H.T.deal 2.601e+03 -0.009
## S.T.new 8.046e-01 0.020
## H.T.billion 8.772e-01 -0.095
## S.P.metropolitan.diary.colon 3.957e+00 -2.205
## H.T.polit 4.494e-01 -1.206
## H.P.verbatim.colon 3.727e+03 -0.004
## H.T.china 1.083e+00 -0.911
## H.T.art 1.054e+00 -1.150
## `PubDate.minute.fctr(14.8,29.5]` 1.990e-01 -1.015
## `PubDate.minute.fctr(29.5,44.2]` 1.941e-01 -1.233
## `PubDate.minute.fctr(44.2,59.1]` 1.993e-01 0.310
## H.T.read 4.636e-01 -2.356
## S.npnct12.log 2.309e-01 -0.736
## A.T.year 9.978e-01 -0.283
## A.T.will 9.269e-01 -1.109
## S.T.appear 1.307e+00 -0.450
## PubDate.wkday.fctr1 5.611e-01 -0.533
## PubDate.wkday.fctr2 6.108e-01 -1.410
## PubDate.wkday.fctr3 6.047e-01 -0.737
## PubDate.wkday.fctr4 5.951e-01 -1.133
## PubDate.wkday.fctr5 6.051e-01 -0.909
## PubDate.wkday.fctr6 5.472e-01 -1.939
## H.T.pictur 6.933e-01 0.203
## H.T.new 5.492e-01 -1.075
## A.T.senat 9.282e-01 0.869
## S.T.show 1.240e+00 -1.091
## H.P.today.in.smallbusiness 2.939e+03 -0.005
## S.T.day 1.147e+00 -0.849
## H.P.first.draft 2.072e+03 -0.007
## S.npnct28.log 2.075e+03 -0.007
## H.P.daily.clip.report 2.626e+03 -0.006
## H.T.clip NA NA
## S.P.daily.clip.report NA NA
## A.T.first 1.084e+00 0.994
## H.T.news 8.649e-01 -0.807
## H.T.X2014 1.102e+00 -0.597
## A.T.newyork 1.111e+00 2.131
## A.T.report 1.221e+00 -1.350
## A.T.compani 9.492e-01 -0.667
## A.T.word 1.189e+00 -0.625
## H.T.busi 7.450e-01 -0.689
## A.T.newyorktim 1.349e+00 1.718
## A.npnct13.log 3.069e-01 3.350
## S.T.share 1.157e+00 -1.544
## A.T.articl 2.474e+00 -1.318
## H.T.newyork 5.433e-01 -1.562
## H.T.springsumm 1.072e+03 -0.011
## H.T.day 7.523e-01 -0.625
## S.T.diari 8.374e+00 2.479
## H.T.report 9.315e-01 -1.391
## S.npnct04.log 7.741e-01 -1.585
## S.T.herald 4.354e+03 0.012
## S.npnct15.log 5.822e-01 0.141
## H.T.week 7.729e-01 -1.135
## A.T.photo 2.217e+00 -0.939
## A.T.intern 2.799e+00 -1.158
## S.T.tribun 4.729e+03 -0.011
## S.P.fashion.week 1.217e+03 0.002
## S.T.archiv 3.506e+03 -0.013
## H.P.fashion.week 9.439e+02 -0.015
## H.npnct15.log 4.030e-01 -4.045
## A.T.fashion 2.894e+03 -0.020
## A.T.week 9.754e-01 -0.204
## H.nstopwrds.log 6.491e-01 -1.900
## H.npnct28.log 1.868e+00 -0.820
## S.npnct11.log 1.756e-01 -0.638
## S.nstopwrds.log 1.731e+00 1.284
## H.ndgts.log 2.899e-01 2.277
## S.ndgts.log 2.272e-01 1.023
## H.nuppr.log 7.397e-01 2.121
## H.nwrds.log 1.007e+00 -0.362
## S.nwrds.log 2.021e+00 -0.178
## A.nchrs.log 8.840e-01 -0.260
## A.nwrds.unq.log 9.111e-01 -1.390
## S.nuppr.log 1.850e-01 -2.919
## Pr(>|z|)
## (Intercept) 0.343833
## WordCount.log < 2e-16 ***
## S.ratio.sum.TfIdf.nwrds 0.030476 *
## H.ratio.sum.TfIdf.nwrds 0.296357
## .clusterid.fctr101 0.330396
## .clusterid.fctr102 0.210554
## .clusterid.fctr103 0.221327
## .clusterid.fctr104 0.324628
## .clusterid.fctr401 0.001334 **
## .clusterid.fctr402 0.993206
## .clusterid.fctr403 0.013039 *
## .clusterid.fctr404 0.011460 *
## .clusterid.fctr405 0.021368 *
## .clusterid.fctr406 0.008512 **
## .clusterid.fctr407 0.993131
## .clusterid.fctr408 0.001707 **
## .clusterid.fctr409 0.002402 **
## .clusterid.fctr410 0.994777
## .clusterid.fctr411 0.995819
## .clusterid.fctr412 0.995271
## .clusterid.fctr413 0.995975
## .clusterid.fctr414 0.135718
## .clusterid.fctr415 0.996614
## .clusterid.fctr501 0.010933 *
## .clusterid.fctr502 0.000667 ***
## .clusterid.fctr503 0.000870 ***
## .clusterid.fctr504 0.000624 ***
## .clusterid.fctr505 0.013577 *
## .clusterid.fctr506 0.001712 **
## .clusterid.fctr507 0.001996 **
## .clusterid.fctr508 0.002875 **
## .clusterid.fctr509 0.004573 **
## .clusterid.fctr510 0.010205 *
## .clusterid.fctr511 0.003635 **
## .clusterid.fctr512 0.009488 **
## .clusterid.fctr513 0.008923 **
## .clusterid.fctr701 0.021205 *
## .clusterid.fctr702 0.014385 *
## .clusterid.fctr703 0.993531
## .clusterid.fctr704 0.006561 **
## .clusterid.fctr705 0.006179 **
## .clusterid.fctr706 0.062766 .
## .clusterid.fctr707 0.007358 **
## .clusterid.fctr1101 0.515407
## .clusterid.fctr1102 0.892969
## .clusterid.fctr1103 0.952286
## .clusterid.fctr1104 0.432665
## .clusterid.fctr1105 0.276867
## .clusterid.fctr1106 0.461774
## .clusterid.fctr1107 0.055676 .
## .clusterid.fctr1108 0.415977
## .clusterid.fctr1109 0.996972
## .clusterid.fctr1501 0.006302 **
## .clusterid.fctr1502 0.002787 **
## .clusterid.fctr1503 0.022308 *
## .clusterid.fctr1504 0.000873 ***
## .clusterid.fctr1505 0.003354 **
## .clusterid.fctr1506 0.083040 .
## .clusterid.fctr1507 0.009197 **
## .clusterid.fctr1508 0.010744 *
## .clusterid.fctr1509 0.012963 *
## .clusterid.fctr1510 0.002235 **
## .clusterid.fctr1511 0.994050
## .clusterid.fctr1512 0.019127 *
## .clusterid.fctr1513 0.008039 **
## .clusterid.fctr1514 0.993658
## .clusterid.fctr1515 0.000799 ***
## .clusterid.fctr1516 0.024908 *
## .clusterid.fctr1517 0.067919 .
## .clusterid.fctr1518 0.001358 **
## .clusterid.fctr1519 0.002817 **
## .clusterid.fctr1520 0.996628
## .clusterid.fctr1521 0.016777 *
## .clusterid.fctr1522 0.008227 **
## .clusterid.fctr1523 0.995908
## .clusterid.fctr1524 0.156210
## .clusterid.fctr1801 0.056859 .
## .clusterid.fctr1802 0.146205
## .clusterid.fctr1803 0.252556
## .clusterid.fctr1804 0.388296
## H.sum.TfIdf 0.134221
## A.sum.TfIdf 0.194501
## `PubDate.hour.fctr(7.67,15.3]` 0.473219
## `PubDate.hour.fctr(15.3,23]` 0.181527
## H.npnct19.log 2.99e-06 ***
## S.ratio.nstopwrds.nwrds 0.230446
## PubDate.wkend 0.696943
## H.P.recap.colon 0.136953
## H.P.quandary 0.997258
## H.P.no.comment.colon 0.070046 .
## A.npnct19.log 0.000883 ***
## H.P.facts.figures 0.532807
## H.npnct08.log 0.016968 *
## PubDate.last10.log 0.163285
## PubDate.last1.log 0.556274
## H.P.readers.respond 1.94e-09 ***
## S.T.make 0.044263 *
## H.ratio.nstopwrds.nwrds 0.058098 .
## H.T.get 0.263303
## S.npnct01.log 0.263893
## H.npnct16.log 0.420184
## S.T.can 0.069116 .
## H.T.ebola 0.707742
## H.npnct01.log 0.362822
## S.T.said 0.389408
## H.T.make 0.369807
## H.npnct11.log 0.064651 .
## `myCategory.fctrForeign#World#Asia Pacific` 0.000307 ***
## `myCategory.fctr#Multimedia#` 0.000397 ***
## `myCategory.fctrCulture#Arts#` NA
## `myCategory.fctrBusiness#Business Day#Dealbook` NA
## myCategory.fctrmyOther 0.993936
## `myCategory.fctrBusiness#Technology#` NA
## `myCategory.fctrBusiness#Crosswords/Games#` 0.677317
## `myCategory.fctrTStyle##` 9.68e-05 ***
## `myCategory.fctrForeign#World#` 0.990830
## `myCategory.fctrOpEd#Opinion#` NA
## `myCategory.fctrStyles##Fashion` 0.988258
## `myCategory.fctr#Opinion#Room For Debate` 5.92e-07 ***
## `myCategory.fctr#U.S.#Education` 0.979774
## `myCategory.fctr##` NA
## `myCategory.fctrMetro#N.Y. / Region#` 0.021252 *
## `myCategory.fctrBusiness#Business Day#Small Business` 0.000204 ***
## `myCategory.fctrStyles#U.S.#` NA
## `myCategory.fctrTravel#Travel#` 0.001787 **
## `myCategory.fctr#Opinion#The Public Editor` NA
## S.T.one 0.148292
## H.P.s.notebook 0.998409
## H.T.take 0.421198
## A.npnct16.log 0.621389
## S.npnct16.log NA
## A.T.presid 0.997530
## S.T.presid 0.997532
## S.npnct08.log 0.123466
## A.npnct08.log NA
## PubDate.last100.log 0.691411
## .rnorm 0.296258
## H.npnct05.log 0.997979
## H.P.friday.night.music 0.082374 .
## H.T.say 0.311011
## H.T.obama 0.751110
## H.T.bank 0.902043
## `PubDate.date.fctr(7,13]` 0.900034
## `PubDate.date.fctr(13,19]` 0.584051
## `PubDate.date.fctr(19,25]` 0.582937
## `PubDate.date.fctr(25,31]` 0.699823
## `PubDate.second.fctr(14.8,29.5]` 0.637436
## `PubDate.second.fctr(29.5,44.2]` 0.914986
## `PubDate.second.fctr(44.2,59.1]` 0.252627
## H.npnct07.log 0.153483
## S.npnct07.log 0.998181
## S.npnct03.log 0.997285
## A.npnct18.log 0.997438
## H.npnct12.log 0.248099
## H.T.word 0.013995 *
## H.T.big 0.599130
## S.P.year.colon 0.997827
## S.T.obama 0.562423
## S.npnct20.log 0.997273
## H.npnct02.log 0.996988
## H.T.test 0.827055
## S.npnct14.log 0.602484
## H.P.on.this.day 0.997938
## S.P.first.draft 0.996969
## S.T.take 0.249852
## S.npnct06.log 0.647953
## S.T.time 0.247326
## H.T.newyorktim 0.879741
## H.npnct13.log 0.250931
## H.T.deal 0.992782
## S.T.new 0.983846
## H.T.billion 0.924099
## S.P.metropolitan.diary.colon 0.027489 *
## H.T.polit 0.227670
## H.P.verbatim.colon 0.996888
## H.T.china 0.362082
## H.T.art 0.250330
## `PubDate.minute.fctr(14.8,29.5]` 0.310315
## `PubDate.minute.fctr(29.5,44.2]` 0.217656
## `PubDate.minute.fctr(44.2,59.1]` 0.756201
## H.T.read 0.018459 *
## S.npnct12.log 0.461637
## A.T.year 0.777015
## A.T.will 0.267592
## S.T.appear 0.652865
## PubDate.wkday.fctr1 0.593901
## PubDate.wkday.fctr2 0.158574
## PubDate.wkday.fctr3 0.461250
## PubDate.wkday.fctr4 0.257217
## PubDate.wkday.fctr5 0.363265
## PubDate.wkday.fctr6 0.052471 .
## H.T.pictur 0.838754
## H.T.new 0.282327
## A.T.senat 0.384734
## S.T.show 0.275138
## H.P.today.in.smallbusiness 0.995767
## S.T.day 0.396137
## H.P.first.draft 0.994154
## S.npnct28.log 0.994256
## H.P.daily.clip.report 0.995230
## H.T.clip NA
## S.P.daily.clip.report NA
## A.T.first 0.320368
## H.T.news 0.419821
## H.T.X2014 0.550319
## A.T.newyork 0.033102 *
## A.T.report 0.177163
## A.T.compani 0.504928
## A.T.word 0.532127
## H.T.busi 0.490820
## A.T.newyorktim 0.085877 .
## A.npnct13.log 0.000809 ***
## S.T.share 0.122628
## A.T.articl 0.187436
## H.T.newyork 0.118363
## H.T.springsumm 0.991176
## H.T.day 0.532089
## S.T.diari 0.013190 *
## H.T.report 0.164212
## S.npnct04.log 0.112995
## S.T.herald 0.990396
## S.npnct15.log 0.888041
## H.T.week 0.256322
## A.T.photo 0.347719
## A.T.intern 0.246855
## S.T.tribun 0.991576
## S.P.fashion.week 0.998326
## S.T.archiv 0.989677
## H.P.fashion.week 0.988088
## H.npnct15.log 5.24e-05 ***
## A.T.fashion 0.984186
## A.T.week 0.838085
## H.nstopwrds.log 0.057494 .
## H.npnct28.log 0.412179
## S.npnct11.log 0.523182
## S.nstopwrds.log 0.199052
## H.ndgts.log 0.022788 *
## S.ndgts.log 0.306350
## H.nuppr.log 0.033943 *
## H.nwrds.log 0.717309
## S.nwrds.log 0.858556
## A.nchrs.log 0.795038
## A.nwrds.unq.log 0.164418
## S.nuppr.log 0.003509 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4042.7 on 4474 degrees of freedom
## Residual deviance: 1623.0 on 4242 degrees of freedom
## AIC: 2089
##
## Number of Fisher Scoring iterations: 19
##
## [1] " calling mypredict_mdl for fit:"
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## threshold f.score
## 1 0.0 0.2867534
## 2 0.1 0.6859345
## 3 0.2 0.7569405
## 4 0.3 0.7710396
## 5 0.4 0.7758389
## 6 0.5 0.7750177
## 7 0.6 0.7420814
## 8 0.7 0.7008821
## 9 0.8 0.6134969
## 10 0.9 0.4916421
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.4000 to maximize f.score.fit"
## Popular.fctr Popular.fctr.predict.Low.cor.X.glm.N
## 1 N 3563
## 2 Y 171
## Popular.fctr.predict.Low.cor.X.glm.Y
## 1 163
## 2 578
## Prediction
## Reference N Y
## N 3563 163
## Y 171 578
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.253631e-01 7.310684e-01 9.172705e-01 9.328952e-01 8.326257e-01
## AccuracyPValue McnemarPValue
## 6.416734e-75 7.017027e-01
## [1] " calling mypredict_mdl for OOB:"
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## threshold f.score
## 1 0.0 0.2865473
## 2 0.1 0.6483633
## 3 0.2 0.7130215
## 4 0.3 0.7438017
## 5 0.4 0.7183308
## 6 0.5 0.7062500
## 7 0.6 0.6744966
## 8 0.7 0.6465364
## 9 0.8 0.5681382
## 10 0.9 0.4602151
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.3000 to maximize f.score.OOB"
## Popular.fctr Popular.fctr.predict.Low.cor.X.glm.N
## 1 N 1601
## 2 Y 74
## Popular.fctr.predict.Low.cor.X.glm.Y
## 1 112
## 2 270
## Prediction
## Reference N Y
## N 1601 112
## Y 74 270
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.095771e-01 6.890845e-01 8.963496e-01 9.216231e-01 8.327662e-01
## AccuracyPValue McnemarPValue
## 5.768804e-24 6.668282e-03
## model_id model_method
## 1 Low.cor.X.glm glm
## feats
## 1 WordCount.log, S.ratio.sum.TfIdf.nwrds, H.ratio.sum.TfIdf.nwrds, .clusterid.fctr, H.sum.TfIdf, A.sum.TfIdf, PubDate.hour.fctr, H.npnct19.log, S.ratio.nstopwrds.nwrds, PubDate.wkend, H.P.recap.colon, H.P.quandary, H.P.no.comment.colon, A.npnct19.log, H.P.facts.figures, H.npnct08.log, PubDate.last10.log, PubDate.last1.log, H.P.readers.respond, S.T.make, H.ratio.nstopwrds.nwrds, H.T.get, S.npnct01.log, H.npnct16.log, S.T.can, H.T.ebola, H.npnct01.log, S.T.said, H.T.make, H.npnct11.log, myCategory.fctr, S.T.one, H.P.s.notebook, H.T.take, A.npnct16.log, S.npnct16.log, A.T.presid, S.T.presid, S.npnct08.log, A.npnct08.log, PubDate.last100.log, .rnorm, H.npnct05.log, H.P.friday.night.music, H.T.say, H.T.obama, H.T.bank, PubDate.date.fctr, PubDate.second.fctr, H.npnct07.log, S.npnct07.log, S.npnct03.log, A.npnct18.log, H.npnct12.log, H.T.word, H.T.big, S.P.year.colon, S.T.obama, S.npnct20.log, H.npnct02.log, H.T.test, S.npnct14.log, H.P.on.this.day, S.P.first.draft, S.T.take, S.npnct06.log, S.T.time, H.T.newyorktim, H.npnct13.log, H.T.deal, S.T.new, H.T.billion, S.P.metropolitan.diary.colon, H.T.polit, H.P.verbatim.colon, H.T.china, H.T.art, PubDate.minute.fctr, H.T.read, S.npnct12.log, A.T.year, A.T.will, S.T.appear, PubDate.wkday.fctr, H.T.pictur, H.T.new, A.T.senat, S.T.show, H.P.today.in.smallbusiness, S.T.day, H.P.first.draft, S.npnct28.log, H.P.daily.clip.report, H.T.clip, S.P.daily.clip.report, A.T.first, H.T.news, H.T.X2014, A.T.newyork, A.T.report, A.T.compani, A.T.word, H.T.busi, A.T.newyorktim, A.npnct13.log, S.T.share, A.T.articl, H.T.newyork, H.T.springsumm, H.T.day, S.T.diari, H.T.report, S.npnct04.log, S.T.herald, S.npnct15.log, H.T.week, A.T.photo, A.T.intern, S.T.tribun, S.P.fashion.week, S.T.archiv, H.P.fashion.week, H.npnct15.log, A.T.fashion, A.T.week, H.nstopwrds.log, H.npnct28.log, S.npnct11.log, S.nstopwrds.log, H.ndgts.log, S.ndgts.log, H.nuppr.log, H.nwrds.log, S.nwrds.log, A.nchrs.log, A.nwrds.unq.log, S.nuppr.log
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 1 19.897 9.476
## max.auc.fit opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.9622452 0.4 0.7758389 0.8905043
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.9172705 0.9328952 0.5860953 0.9229354
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.3 0.7438017 0.9095771
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB min.aic.fit
## 1 0.8963496 0.9216231 0.6890845 2088.981
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.02193724 0.08903616
# Discard ret_lst (in this script it holds the value returned by myfit_mdl)
#   so that a stale result is not carried into the next group of fits.
rm(list = "ret_lst")

# Log the next "fit.models" step in the global chunk log; with major.inc=FALSE
#   the major step number is kept and only the minor step advances
#   (7.0 -> 7.1 in the log printed below).
glb_chunks_df <- myadd_chunk(
    glb_chunks_df, "fit.models",
    major.inc = FALSE
)
## label step_major step_minor bgn end elapsed
## 10 fit.models 7 0 372.049 425.868 53.82
## 11 fit.models 7 1 425.869 NA NA
# Begin a separate chunk log for the fit.models_1 section; NULL is passed in
#   place of an existing log, and the result is the one-row log printed below.
fit.models_1_chunk_df <-
    myadd_chunk(NULL, "fit.models_1_bgn")
## label step_major step_minor bgn end elapsed
## 1 fit.models_1_bgn 1 0 430.046 NA NA
# Fit one "All.X" model per method listed in glb_models_method_vctr.
#
# Options:
#   1. rpart & rf manual tuning
#   2. rf without pca (default: with pca)

# Candidate features: all X that is not user excluded (and not near-zero-variance).
#   The commented-out branch is an alternative "Conditional.X" feature set that
#   is kept for reference only.
# if (glb_is_classification && glb_is_binomial) {
#     model_id_pfx <- "Conditional.X"
# #     indep_vars_vctr <- setdiff(names(glb_fitobs_df), union(glb_rsp_var, glb_exclude_vars_as_features))
#     indep_vars_vctr <- subset(glb_feats_df, is.ConditionalX.y &
#                                             (exclude.as.feat != 1))[, "id"]
# } else {
model_id_pfx <- "All.X"
indep_vars_vctr <- subset(glb_feats_df, !myNearZV &
                                        (exclude.as.feat != 1))[, "id"]
# }

# Features removed for glm only (for a "robust" glm model). Defined once,
#   outside the loop, because the list does not depend on the method.
glm_drop_vars_vctr <- c(NULL
      ,"A.nchrs.log"      # correlated to "S.*"
      ,"A.ndgts.log"      # correlated to "S.*"
      ,"A.nuppr.log"      # correlated to "S.*"
      ,"A.npnct01.log"    # identical  to "S.npnct01.log"
      ,"A.npnct03.log"    # correlated to "S.npnct03.log"
      ,"A.npnct04.log"    # correlated to "S.npnct04.log"
      ,"A.npnct06.log"    # identical  to "S.npnct06.log"
      ,"A.npnct07.log"    # identical  to "S.npnct07.log"
      ,"A.npnct08.log"    # correlated to "S.npnct08.log"
      ,"A.npnct11.log"    # correlated to "S.*"
      ,"A.npnct12.log"    # correlated to "S.*"
      ,"S.npnct14.log"    # correlated to "A.*"
      ,"A.npnct15.log"    # correlated to "S.npnct15.log"
      ,"A.npnct16.log"    # correlated to "S.npnct16.log"
      ,"A.npnct19.log"    # correlated to "S.*"
      ,"A.npnct20.log"    # identical  to "S.npnct20.log"
      ,"A.npnct21.log"    # correlated to "S.npnct21.log"
      ,"A.P.daily.clip.report"    # identical  to "S.*"
      ,"S.P.daily.clip.report"    # identical  to "H.*"
      ,"A.P.http"         # correlated  to "A.npnct14.log"
      ,"A.P.fashion.week" # identical  to "S.*"
      ,"H.P.first.draft"  # correlated  to "H.T.first"
      ,"A.P.first.draft"  # identical  to "S.*"
      ,"A.P.metropolitan.diary.colon" # identical  to "S.*"
      ,"A.P.year.colon"   # identical  to "S.P.year.colon"
                              )

for (method in glb_models_method_vctr) {
    fit.models_1_chunk_df <- myadd_chunk(fit.models_1_chunk_df,
                                paste0("fit.models_1_", method), major.inc=TRUE)

    # Start every iteration from the full candidate set. Previously
    #   indep_vars_vctr itself was overwritten inside the loop, so the
    #   method-specific exclusions below leaked into every later method and
    #   the features of an "All.X" model depended on the order of
    #   glb_models_method_vctr.
    mdl_indep_vars_vctr <- indep_vars_vctr

    if (method %in% c("rpart", "rf")) {
        # rpart:  fubar's the tree
        # rf:     skip the scenario w/ .rnorm for speed
        mdl_indep_vars_vctr <- setdiff(mdl_indep_vars_vctr, c(".rnorm"))
        model_id <- paste0(model_id_pfx, ".no.rnorm")
    } else {
        model_id <- model_id_pfx
    }

    if (method %in% c("glm")) {
        # for a "robust" glm model
        mdl_indep_vars_vctr <- setdiff(mdl_indep_vars_vctr, glm_drop_vars_vctr)
    }

    # ret_lst is not read again inside this loop; each iteration overwrites it.
    ret_lst <- myfit_mdl(model_id=model_id, model_method=method,
                            indep_vars_vctr=mdl_indep_vars_vctr,
                            model_type=glb_model_type,
                            rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
                            fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df,
                n_cv_folds=glb_n_cv_folds, tune_models_df=glb_tune_models_df)

    # If All.X.glm is less accurate than Low.Cor.X.glm
    #   check NA coefficients & filter appropriate terms in indep_vars_vctr
#     if (method == "glm") {
#         orig_glm <- glb_models_lst[[paste0(model_id, ".", model_method)]]$finalModel
#         orig_glm <- glb_models_lst[["All.X.glm"]]$finalModel; print(summary(orig_glm))
#           vif_orig_glm <- vif(orig_glm); print(vif_orig_glm)
#           print(vif_orig_glm[!is.na(vif_orig_glm) & (vif_orig_glm == Inf)])
#           print(which.max(vif_orig_glm))
#           print(sort(vif_orig_glm[vif_orig_glm >= 1.0e+03], decreasing=TRUE))
#           glb_fitobs_df[c(1143, 3637, 3953, 4105), c("UniqueID", "Popular", "H.P.quandary", "Headline")]
#           glb_feats_df[glb_feats_df$id %in% grep("[HSA]\\.nchrs.log", glb_feats_df$id, value=TRUE) | glb_feats_df$cor.high.X %in%    grep("[HSA]\\.nchrs.log", glb_feats_df$id, value=TRUE), ]
#           glb_feats_df[glb_feats_df$id %in% grep("[HSA]\\.npnct14.log", glb_feats_df$id, value=TRUE) | glb_feats_df$cor.high.X %in%    grep("[HSA]\\.npnct14.log", glb_feats_df$id, value=TRUE), ]
#           glb_feats_df[glb_feats_df$id %in% grep("[HSA]\\.T.scen", glb_feats_df$id, value=TRUE) | glb_feats_df$cor.high.X %in%         grep("[HSA]\\.T.scen", glb_feats_df$id, value=TRUE), ]
#           glb_feats_df[glb_feats_df$id %in% grep("[HSA]\\.P.first", glb_feats_df$id, value=TRUE) | glb_feats_df$cor.high.X %in%         grep("[HSA]\\.P.first", glb_feats_df$id, value=TRUE), ]
#           all.equal(glb_allobs_df$S.nuppr.log, glb_allobs_df$A.nuppr.log)
#           all.equal(glb_allobs_df$S.npnct19.log, glb_allobs_df$A.npnct19.log)
#           all.equal(glb_allobs_df$S.P.year.colon, glb_allobs_df$A.P.year.colon)
#           all.equal(glb_allobs_df$S.T.share, glb_allobs_df$A.T.share)
#           all.equal(glb_allobs_df$H.T.clip, glb_allobs_df$H.P.daily.clip.report)
#           cor(glb_allobs_df$S.T.herald, glb_allobs_df$S.T.tribun)
#           dsp_obs(Abstract.contains="[Dd]iar", cols=("Abstract"), all=TRUE)
#           dsp_obs(Abstract.contains="[Ss]hare", cols=("Abstract"), all=TRUE)
#           subset(glb_feats_df, cor.y.abs <= glb_feats_df[glb_feats_df$id == ".rnorm", "cor.y.abs"])
#         corxx_mtrx <- cor(data.matrix(glb_allobs_df[, setdiff(names(glb_allobs_df), myfind_chr_cols_df(glb_allobs_df))]), use="pairwise.complete.obs"); abs_corxx_mtrx <- abs(corxx_mtrx); diag(abs_corxx_mtrx) <- 0
#           which.max(abs_corxx_mtrx["S.T.tribun", ])
#           abs_corxx_mtrx["A.npnct08.log", "S.npnct08.log"]
#         step_glm <- step(orig_glm)
#     }

    # Since caret does not optimize rpart well
#     if (method == "rpart")
#         ret_lst <- myfit_mdl(model_id=paste0(model_id_pfx, ".cp.0"), model_method=method,
#                                 indep_vars_vctr=indep_vars_vctr,
#                                 model_type=glb_model_type,
#                                 rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
#                                 fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df,
#             n_cv_folds=0, tune_models_df=data.frame(parameter="cp", min=0.0, max=0.0, by=0.1))
}
## label step_major step_minor bgn end elapsed
## 1 fit.models_1_bgn 1 0 430.046 430.059 0.013
## 2 fit.models_1_glm 2 0 430.059 NA NA
## [1] "fitting model: All.X.glm"
## [1] " indep_vars: WordCount.log, A.ratio.sum.TfIdf.nwrds, S.ratio.sum.TfIdf.nwrds, H.ratio.sum.TfIdf.nwrds, .clusterid.fctr, H.sum.TfIdf, S.sum.TfIdf, A.sum.TfIdf, PubDate.hour.fctr, H.npnct19.log, A.ratio.nstopwrds.nwrds, S.ratio.nstopwrds.nwrds, PubDate.wkend, H.P.recap.colon, H.P.quandary, H.P.no.comment.colon, S.npnct19.log, H.P.facts.figures, H.npnct08.log, PubDate.last10.log, PubDate.last1.log, H.P.readers.respond, A.T.make, S.T.make, H.ratio.nstopwrds.nwrds, H.T.get, H.npnct06.log, S.npnct01.log, A.T.can, H.npnct16.log, S.T.can, H.T.ebola, H.npnct01.log, A.T.said, S.T.said, H.T.make, H.npnct11.log, myCategory.fctr, A.T.one, S.T.one, H.P.s.notebook, H.T.take, S.npnct16.log, A.T.presid, S.T.presid, S.npnct08.log, PubDate.last100.log, .rnorm, H.npnct05.log, H.P.friday.night.music, H.T.say, H.T.obama, H.T.bank, PubDate.date.fctr, PubDate.second.fctr, H.npnct07.log, S.npnct07.log, S.npnct03.log, A.npnct18.log, H.npnct12.log, H.T.word, H.T.big, A.npnct02.log, A.npnct17.log, S.P.year.colon, S.T.obama, A.T.obama, S.npnct20.log, H.npnct02.log, H.T.test, H.P.on.this.day, S.P.first.draft, S.T.take, A.T.take, S.npnct06.log, A.npnct14.log, S.T.time, A.T.time, H.T.newyorktim, H.npnct13.log, H.T.deal, S.T.new, A.T.new, H.T.billion, S.P.metropolitan.diary.colon, H.T.polit, H.P.verbatim.colon, H.T.china, H.T.art, PubDate.minute.fctr, H.T.read, S.npnct12.log, H.P.today.in.politic, A.T.year, S.T.year, H.P.what.we.are, A.T.will, S.T.will, A.T.appear, S.T.appear, PubDate.wkday.fctr, H.T.pictur, H.T.new, A.T.senat, S.T.senat, S.T.show, A.T.show, H.P.today.in.smallbusiness, S.T.day, A.T.day, S.npnct28.log, A.npnct28.log, H.P.daily.clip.report, H.T.clip, A.T.first, H.T.news, S.T.first, H.T.first, H.T.X2014, A.T.newyork, S.T.newyork, A.T.report, A.T.compani, S.T.report, S.T.compani, A.T.word, S.T.word, H.T.morn, H.T.busi, A.T.newyorktim, S.T.newyorktim, A.npnct13.log, A.T.share, S.T.share, H.npnct04.log, S.npnct13.log, A.T.articl, S.T.articl, H.T.newyork, H.T.today, 
H.T.springsumm, H.T.day, H.npnct14.log, A.T.diari, S.T.diari, H.T.report, S.npnct04.log, H.T.daili, H.T.X2015, A.T.herald, S.T.herald, S.npnct15.log, H.T.week, A.T.photo, S.T.photo, A.T.intern, S.T.intern, A.T.tribun, S.T.tribun, S.P.fashion.week, A.T.archiv, S.T.archiv, H.P.fashion.week, H.P.year.colon, H.T.fashion, H.npnct15.log, A.T.fashion, S.T.fashion, A.T.week, S.T.week, H.nstopwrds.log, H.npnct28.log, S.npnct11.log, S.nstopwrds.log, A.nstopwrds.log, H.ndgts.log, S.ndgts.log, H.nuppr.log, H.nwrds.log, H.nchrs.log, S.nwrds.log, A.nwrds.log, H.nwrds.unq.log, S.nchrs.log, A.nwrds.unq.log, S.nwrds.unq.log, S.nuppr.log"
## Aggregating results
## Fitting final model on full training set
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: not plotting observations with leverage one:
## 129, 3511, 3846, 3953
## Warning: not plotting observations with leverage one:
## 129, 3511, 3846, 3953
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1984 -0.2546 -0.0633 0.0000 3.7454
##
## Coefficients: (15 not defined because of singularities)
## Estimate
## (Intercept) -3.212e+00
## WordCount.log 1.333e+00
## A.ratio.sum.TfIdf.nwrds -1.837e+02
## S.ratio.sum.TfIdf.nwrds 1.851e+02
## H.ratio.sum.TfIdf.nwrds 2.176e-01
## .clusterid.fctr101 -2.049e+00
## .clusterid.fctr102 -2.473e+00
## .clusterid.fctr103 -2.509e+00
## .clusterid.fctr104 -2.471e+00
## .clusterid.fctr401 -6.552e+00
## .clusterid.fctr402 -2.493e+01
## .clusterid.fctr403 -5.091e+00
## .clusterid.fctr404 -4.836e+00
## .clusterid.fctr405 -4.178e+00
## .clusterid.fctr406 -5.540e+00
## .clusterid.fctr407 -2.452e+01
## .clusterid.fctr408 -6.677e+00
## .clusterid.fctr409 -6.459e+00
## .clusterid.fctr410 -2.475e+01
## .clusterid.fctr411 -2.399e+01
## .clusterid.fctr412 -2.472e+01
## .clusterid.fctr413 -2.527e+01
## .clusterid.fctr414 -3.039e+00
## .clusterid.fctr415 -1.867e+01
## .clusterid.fctr501 -4.322e+00
## .clusterid.fctr502 -5.883e+00
## .clusterid.fctr503 -5.975e+00
## .clusterid.fctr504 -6.320e+00
## .clusterid.fctr505 -5.208e+00
## .clusterid.fctr506 -5.437e+00
## .clusterid.fctr507 -5.622e+00
## .clusterid.fctr508 -5.942e+00
## .clusterid.fctr509 -5.502e+00
## .clusterid.fctr510 -4.669e+00
## .clusterid.fctr511 -6.058e+00
## .clusterid.fctr512 -4.747e+00
## .clusterid.fctr513 -4.992e+00
## .clusterid.fctr701 -4.224e+00
## .clusterid.fctr702 -4.328e+00
## .clusterid.fctr703 -2.308e+01
## .clusterid.fctr704 -4.823e+00
## .clusterid.fctr705 -4.935e+00
## .clusterid.fctr706 -3.374e+00
## .clusterid.fctr707 -5.540e+00
## .clusterid.fctr1101 -1.467e+00
## .clusterid.fctr1102 -3.126e-01
## .clusterid.fctr1103 -6.935e-01
## .clusterid.fctr1104 -1.784e+00
## .clusterid.fctr1105 -2.214e+00
## .clusterid.fctr1106 -1.955e+00
## .clusterid.fctr1107 -3.631e+00
## .clusterid.fctr1108 -1.829e+00
## .clusterid.fctr1109 1.994e+01
## .clusterid.fctr1501 -4.934e+00
## .clusterid.fctr1502 -5.600e+00
## .clusterid.fctr1503 -4.238e+00
## .clusterid.fctr1504 -6.270e+00
## .clusterid.fctr1505 -5.507e+00
## .clusterid.fctr1506 -4.015e+00
## .clusterid.fctr1507 -5.250e+00
## .clusterid.fctr1508 -5.097e+00
## .clusterid.fctr1509 -4.687e+00
## .clusterid.fctr1510 -6.725e+00
## .clusterid.fctr1511 -2.347e+01
## .clusterid.fctr1512 -4.726e+00
## .clusterid.fctr1513 -5.262e+00
## .clusterid.fctr1514 -2.270e+01
## .clusterid.fctr1515 -7.472e+00
## .clusterid.fctr1516 -4.174e+00
## .clusterid.fctr1517 -3.743e+00
## .clusterid.fctr1518 -6.786e+00
## .clusterid.fctr1519 -5.709e+00
## .clusterid.fctr1520 -1.793e+01
## .clusterid.fctr1521 -4.698e+00
## .clusterid.fctr1522 -5.252e+00
## .clusterid.fctr1523 -2.370e+01
## .clusterid.fctr1524 -4.598e+00
## .clusterid.fctr1801 -3.622e+00
## .clusterid.fctr1802 -2.989e+00
## .clusterid.fctr1803 -2.703e+00
## .clusterid.fctr1804 -2.637e+00
## H.sum.TfIdf -1.391e-01
## S.sum.TfIdf -1.966e+01
## A.sum.TfIdf 1.951e+01
## `PubDate.hour.fctr(7.67,15.3]` 2.423e-01
## `PubDate.hour.fctr(15.3,23]` 4.409e-01
## H.npnct19.log 1.529e+00
## A.ratio.nstopwrds.nwrds 3.654e+04
## S.ratio.nstopwrds.nwrds -3.654e+04
## PubDate.wkend -3.671e-01
## H.P.recap.colon 1.680e+00
## H.P.quandary 2.417e+01
## H.P.no.comment.colon 1.917e+00
## S.npnct19.log 1.459e+00
## H.P.facts.figures 2.772e-01
## H.npnct08.log 1.295e+00
## PubDate.last10.log 2.447e-01
## PubDate.last1.log -5.323e-02
## H.P.readers.respond 6.916e+00
## A.T.make -1.601e+04
## S.T.make 1.599e+04
## H.ratio.nstopwrds.nwrds 4.481e+00
## H.T.get 3.268e-01
## H.npnct06.log 1.302e+00
## S.npnct01.log 2.204e+00
## A.T.can 1.353e+03
## H.npnct16.log -7.201e-01
## S.T.can -1.355e+03
## H.T.ebola -7.220e-02
## H.npnct01.log -1.278e+00
## A.T.said -4.738e+03
## S.T.said 4.739e+03
## H.T.make -2.348e-01
## H.npnct11.log 4.810e-01
## `myCategory.fctrForeign#World#Asia Pacific` -6.389e+00
## `myCategory.fctr#Multimedia#` -6.649e+00
## `myCategory.fctrCulture#Arts#` NA
## `myCategory.fctrBusiness#Business Day#Dealbook` NA
## myCategory.fctrmyOther -2.507e+01
## `myCategory.fctrBusiness#Technology#` NA
## `myCategory.fctrBusiness#Crosswords/Games#` -1.297e+00
## `myCategory.fctrTStyle##` -7.023e+00
## `myCategory.fctrForeign#World#` -2.346e+01
## `myCategory.fctrOpEd#Opinion#` NA
## `myCategory.fctrStyles##Fashion` -2.828e+01
## `myCategory.fctr#Opinion#Room For Debate` -9.238e+00
## `myCategory.fctr#U.S.#Education` -5.964e+01
## `myCategory.fctr##` NA
## `myCategory.fctrMetro#N.Y. / Region#` -4.092e+00
## `myCategory.fctrBusiness#Business Day#Small Business` -6.895e+00
## `myCategory.fctrStyles#U.S.#` NA
## `myCategory.fctrTravel#Travel#` -6.296e+00
## `myCategory.fctr#Opinion#The Public Editor` NA
## A.T.one 2.176e+04
## S.T.one -2.174e+04
## H.P.s.notebook -1.864e+01
## H.T.take -4.173e-01
## S.npnct16.log -1.133e+00
## A.T.presid 6.046e+03
## S.T.presid -6.046e+03
## S.npnct08.log 1.165e+00
## PubDate.last100.log 1.004e-02
## .rnorm -7.635e-02
## H.npnct05.log -2.850e+01
## H.P.friday.night.music -1.968e+00
## H.T.say -5.721e-01
## H.T.obama -1.884e-01
## H.T.bank -9.352e-02
## `PubDate.date.fctr(7,13]` -4.197e-03
## `PubDate.date.fctr(13,19]` -5.789e-02
## `PubDate.date.fctr(19,25]` -5.722e-02
## `PubDate.date.fctr(25,31]` 1.539e-01
## `PubDate.second.fctr(14.8,29.5]` 7.817e-02
## `PubDate.second.fctr(29.5,44.2]` -5.376e-02
## `PubDate.second.fctr(44.2,59.1]` -2.640e-01
## H.npnct07.log 2.425e-01
## S.npnct07.log -3.700e+01
## S.npnct03.log -3.427e+01
## A.npnct18.log 1.582e+01
## H.npnct12.log 2.769e-01
## H.T.word 2.274e+00
## H.T.big -2.396e-01
## A.npnct02.log -2.014e+01
## A.npnct17.log 5.237e+02
## S.P.year.colon 3.563e+00
## S.T.obama -4.978e+03
## A.T.obama 4.977e+03
## S.npnct20.log -3.063e+01
## H.npnct02.log -2.094e+01
## H.T.test -2.138e-01
## H.P.on.this.day -1.675e+01
## S.P.first.draft -1.784e+01
## S.T.take 2.188e+03
## A.T.take -2.192e+03
## S.npnct06.log -1.890e-01
## A.npnct14.log 1.323e+00
## S.T.time 3.802e+04
## A.T.time -3.806e+04
## H.T.newyorktim -1.987e-01
## H.npnct13.log -3.316e-01
## H.T.deal -2.596e+01
## S.T.new -1.134e+04
## A.T.new 1.134e+04
## H.T.billion 1.266e+00
## S.P.metropolitan.diary.colon -9.352e+00
## H.T.polit -6.575e-01
## H.P.verbatim.colon -1.632e+01
## H.T.china -7.508e-01
## H.T.art -1.176e+00
## `PubDate.minute.fctr(14.8,29.5]` -1.557e-01
## `PubDate.minute.fctr(29.5,44.2]` -1.958e-01
## `PubDate.minute.fctr(44.2,59.1]` 8.618e-02
## H.T.read -9.421e-01
## S.npnct12.log -1.849e-01
## H.P.today.in.politic 2.225e-02
## A.T.year 7.411e+03
## S.T.year -7.412e+03
## H.P.what.we.are -1.895e+01
## A.T.will 3.612e+03
## S.T.will -3.609e+03
## A.T.appear -4.878e-01
## S.T.appear NA
## PubDate.wkday.fctr1 -4.453e-01
## PubDate.wkday.fctr2 -1.073e+00
## PubDate.wkday.fctr3 -5.932e-01
## PubDate.wkday.fctr4 -8.257e-01
## PubDate.wkday.fctr5 -7.368e-01
## PubDate.wkday.fctr6 -1.223e+00
## H.T.pictur 5.234e-02
## H.T.new -5.843e-01
## A.T.senat 1.883e+03
## S.T.senat -1.881e+03
## S.T.show 1.059e+04
## A.T.show -1.059e+04
## H.P.today.in.smallbusiness -1.673e+01
## S.T.day -2.038e+03
## A.T.day 2.041e+03
## S.npnct28.log 6.244e+01
## A.npnct28.log -9.998e+01
## H.P.daily.clip.report 2.802e+01
## H.T.clip NA
## A.T.first 1.071e+03
## H.T.news -8.111e-01
## S.T.first -1.070e+03
## H.T.first -6.870e-01
## H.T.X2014 -8.966e-01
## A.T.newyork 3.661e+04
## S.T.newyork -3.661e+04
## A.T.report -2.683e+04
## A.T.compani 5.161e+03
## S.T.report 2.683e+04
## S.T.compani -5.162e+03
## A.T.word -6.670e+03
## S.T.word 6.669e+03
## H.T.morn 1.509e+01
## H.T.busi -3.817e-01
## A.T.newyorktim -8.282e+03
## S.T.newyorktim 8.269e+03
## A.npnct13.log 1.224e+02
## A.T.share -1.894e+00
## S.T.share NA
## H.npnct04.log -2.543e+00
## S.npnct13.log -1.214e+02
## A.T.articl -8.969e+03
## S.T.articl 8.965e+03
## H.T.newyork -5.177e-01
## H.T.today -7.413e-01
## H.T.springsumm 3.356e+01
## H.T.day -4.960e-01
## H.npnct14.log -2.533e+01
## A.T.diari 2.100e+01
## S.T.diari NA
## H.T.report -1.068e+00
## S.npnct04.log -9.429e-01
## H.T.daili -2.585e+01
## H.T.X2015 -3.120e+01
## A.T.herald 7.486e+01
## S.T.herald NA
## S.npnct15.log 3.256e-01
## H.T.week -8.756e-01
## A.T.photo -2.389e+00
## S.T.photo NA
## A.T.intern 2.814e+02
## S.T.intern -2.844e+02
## A.T.tribun -5.541e+01
## S.T.tribun NA
## S.P.fashion.week 3.637e+00
## A.T.archiv -5.374e+01
## S.T.archiv NA
## H.P.fashion.week -1.776e+01
## H.P.year.colon -1.576e+01
## H.T.fashion 2.608e+00
## H.npnct15.log -1.566e+00
## A.T.fashion -4.174e+04
## S.T.fashion 4.167e+04
## A.T.week 1.199e+03
## S.T.week -1.199e+03
## H.nstopwrds.log -1.191e+00
## H.npnct28.log -1.556e+00
## S.npnct11.log -8.572e-02
## S.nstopwrds.log 1.041e+04
## A.nstopwrds.log -1.041e+04
## H.ndgts.log 9.027e-01
## S.ndgts.log 2.029e-01
## H.nuppr.log 1.990e+00
## H.nwrds.log 9.904e-01
## H.nchrs.log -9.879e-01
## S.nwrds.log -7.548e+03
## A.nwrds.log 7.548e+03
## H.nwrds.unq.log -1.109e+00
## S.nchrs.log -2.317e-02
## A.nwrds.unq.log 3.673e+03
## S.nwrds.unq.log -3.674e+03
## S.nuppr.log -5.258e-01
## Std. Error z value
## (Intercept) 5.482e+00 -0.586
## WordCount.log 1.159e-01 11.501
## A.ratio.sum.TfIdf.nwrds 2.238e+02 -0.821
## S.ratio.sum.TfIdf.nwrds 2.238e+02 0.827
## H.ratio.sum.TfIdf.nwrds 2.028e-01 1.073
## .clusterid.fctr101 1.638e+00 -1.251
## .clusterid.fctr102 1.667e+00 -1.484
## .clusterid.fctr103 1.773e+00 -1.415
## .clusterid.fctr104 1.806e+00 -1.368
## .clusterid.fctr401 2.093e+00 -3.130
## .clusterid.fctr402 6.928e+03 -0.004
## .clusterid.fctr403 1.997e+00 -2.550
## .clusterid.fctr404 1.787e+00 -2.707
## .clusterid.fctr405 1.710e+00 -2.444
## .clusterid.fctr406 1.956e+00 -2.832
## .clusterid.fctr407 6.879e+03 -0.004
## .clusterid.fctr408 2.093e+00 -3.190
## .clusterid.fctr409 2.088e+00 -3.094
## .clusterid.fctr410 9.364e+03 -0.003
## .clusterid.fctr411 1.107e+04 -0.002
## .clusterid.fctr412 1.023e+04 -0.002
## .clusterid.fctr413 1.210e+04 -0.002
## .clusterid.fctr414 1.891e+00 -1.608
## .clusterid.fctr415 6.817e+02 -0.027
## .clusterid.fctr501 1.635e+00 -2.643
## .clusterid.fctr502 1.708e+00 -3.445
## .clusterid.fctr503 1.748e+00 -3.418
## .clusterid.fctr504 1.774e+00 -3.562
## .clusterid.fctr505 1.985e+00 -2.623
## .clusterid.fctr506 1.688e+00 -3.221
## .clusterid.fctr507 1.744e+00 -3.224
## .clusterid.fctr508 1.953e+00 -3.042
## .clusterid.fctr509 1.797e+00 -3.061
## .clusterid.fctr510 1.769e+00 -2.639
## .clusterid.fctr511 1.960e+00 -3.091
## .clusterid.fctr512 1.778e+00 -2.670
## .clusterid.fctr513 1.798e+00 -2.777
## .clusterid.fctr701 1.674e+00 -2.523
## .clusterid.fctr702 1.693e+00 -2.557
## .clusterid.fctr703 6.785e+03 -0.003
## .clusterid.fctr704 1.703e+00 -2.832
## .clusterid.fctr705 1.724e+00 -2.863
## .clusterid.fctr706 1.668e+00 -2.023
## .clusterid.fctr707 1.997e+00 -2.774
## .clusterid.fctr1101 1.628e+00 -0.901
## .clusterid.fctr1102 1.690e+00 -0.185
## .clusterid.fctr1103 1.808e+00 -0.383
## .clusterid.fctr1104 1.681e+00 -1.061
## .clusterid.fctr1105 1.691e+00 -1.309
## .clusterid.fctr1106 1.757e+00 -1.113
## .clusterid.fctr1107 1.715e+00 -2.117
## .clusterid.fctr1108 1.724e+00 -1.061
## .clusterid.fctr1109 1.327e+04 0.002
## .clusterid.fctr1501 1.692e+00 -2.916
## .clusterid.fctr1502 1.717e+00 -3.262
## .clusterid.fctr1503 1.666e+00 -2.544
## .clusterid.fctr1504 1.746e+00 -3.592
## .clusterid.fctr1505 1.718e+00 -3.205
## .clusterid.fctr1506 1.901e+00 -2.113
## .clusterid.fctr1507 1.855e+00 -2.829
## .clusterid.fctr1508 1.863e+00 -2.736
## .clusterid.fctr1509 1.697e+00 -2.762
## .clusterid.fctr1510 2.054e+00 -3.274
## .clusterid.fctr1511 3.149e+04 -0.001
## .clusterid.fctr1512 1.811e+00 -2.610
## .clusterid.fctr1513 1.968e+00 -2.674
## .clusterid.fctr1514 7.194e+03 -0.003
## .clusterid.fctr1515 2.030e+00 -3.681
## .clusterid.fctr1516 1.713e+00 -2.436
## .clusterid.fctr1517 1.774e+00 -2.110
## .clusterid.fctr1518 1.996e+00 -3.399
## .clusterid.fctr1519 1.768e+00 -3.230
## .clusterid.fctr1520 8.749e+03 -0.002
## .clusterid.fctr1521 1.776e+00 -2.645
## .clusterid.fctr1522 1.796e+00 -2.925
## .clusterid.fctr1523 1.099e+04 -0.002
## .clusterid.fctr1524 2.615e+00 -1.758
## .clusterid.fctr1801 1.642e+00 -2.205
## .clusterid.fctr1802 1.683e+00 -1.775
## .clusterid.fctr1803 1.758e+00 -1.537
## .clusterid.fctr1804 2.243e+00 -1.175
## H.sum.TfIdf 1.019e-01 -1.365
## S.sum.TfIdf 1.644e+01 -1.196
## A.sum.TfIdf 1.645e+01 1.186
## `PubDate.hour.fctr(7.67,15.3]` 2.711e-01 0.894
## `PubDate.hour.fctr(15.3,23]` 2.781e-01 1.586
## H.npnct19.log 3.701e-01 4.132
## A.ratio.nstopwrds.nwrds 7.528e+06 0.005
## S.ratio.nstopwrds.nwrds 7.528e+06 -0.005
## PubDate.wkend 4.826e-01 -0.761
## H.P.recap.colon 1.235e+00 1.360
## H.P.quandary 1.864e+04 0.001
## H.P.no.comment.colon 1.124e+00 1.705
## S.npnct19.log 4.283e-01 3.407
## H.P.facts.figures 1.662e+00 0.167
## H.npnct08.log 5.095e-01 2.543
## PubDate.last10.log 1.340e-01 1.825
## PubDate.last1.log 4.963e-02 -1.073
## H.P.readers.respond 1.151e+00 6.007
## A.T.make 4.840e+06 -0.003
## S.T.make 4.835e+06 0.003
## H.ratio.nstopwrds.nwrds 2.957e+00 1.516
## H.T.get 4.350e-01 0.751
## H.npnct06.log 1.205e+00 1.080
## S.npnct01.log 2.182e+00 1.010
## A.T.can 1.822e+05 0.007
## H.npnct16.log 1.217e+00 -0.592
## S.T.can 1.822e+05 -0.007
## H.T.ebola 3.376e-01 -0.214
## H.npnct01.log 1.367e+00 -0.935
## A.T.said 1.426e+06 -0.003
## S.T.said 1.426e+06 0.003
## H.T.make 4.140e-01 -0.567
## H.npnct11.log 2.323e-01 2.071
## `myCategory.fctrForeign#World#Asia Pacific` 1.743e+00 -3.666
## `myCategory.fctr#Multimedia#` 1.790e+00 -3.713
## `myCategory.fctrCulture#Arts#` NA NA
## `myCategory.fctrBusiness#Business Day#Dealbook` NA NA
## myCategory.fctrmyOther 7.823e+03 -0.003
## `myCategory.fctrBusiness#Technology#` NA NA
## `myCategory.fctrBusiness#Crosswords/Games#` 1.685e+00 -0.770
## `myCategory.fctrTStyle##` 1.687e+00 -4.163
## `myCategory.fctrForeign#World#` 4.800e+03 -0.005
## `myCategory.fctrOpEd#Opinion#` NA NA
## `myCategory.fctrStyles##Fashion` 4.114e+03 -0.007
## `myCategory.fctr#Opinion#Room For Debate` 1.822e+00 -5.071
## `myCategory.fctr#U.S.#Education` 6.359e+03 -0.009
## `myCategory.fctr##` NA NA
## `myCategory.fctrMetro#N.Y. / Region#` 1.695e+00 -2.414
## `myCategory.fctrBusiness#Business Day#Small Business` 1.749e+00 -3.942
## `myCategory.fctrStyles#U.S.#` NA NA
## `myCategory.fctrTravel#Travel#` 1.916e+00 -3.286
## `myCategory.fctr#Opinion#The Public Editor` NA NA
## A.T.one 1.886e+06 0.012
## S.T.one 1.884e+06 -0.012
## H.P.s.notebook 2.298e+04 -0.001
## H.T.take 5.116e-01 -0.816
## S.npnct16.log 1.442e+00 -0.786
## A.T.presid 8.654e+05 0.007
## S.T.presid 8.654e+05 -0.007
## S.npnct08.log 7.361e-01 1.583
## PubDate.last100.log 4.850e-02 0.207
## .rnorm 6.984e-02 -1.093
## H.npnct05.log 2.677e+04 -0.001
## H.P.friday.night.music 1.358e+00 -1.449
## H.T.say 4.668e-01 -1.226
## H.T.obama 4.766e-01 -0.395
## H.T.bank 5.252e-01 -0.178
## `PubDate.date.fctr(7,13]` 2.178e-01 -0.019
## `PubDate.date.fctr(13,19]` 2.165e-01 -0.267
## `PubDate.date.fctr(19,25]` 2.109e-01 -0.271
## `PubDate.date.fctr(25,31]` 2.286e-01 0.673
## `PubDate.second.fctr(14.8,29.5]` 1.930e-01 0.405
## `PubDate.second.fctr(29.5,44.2]` 1.892e-01 -0.284
## `PubDate.second.fctr(44.2,59.1]` 1.959e-01 -1.347
## H.npnct07.log 2.341e-01 1.035
## S.npnct07.log 2.535e+04 -0.001
## S.npnct03.log 2.350e+04 -0.001
## A.npnct18.log 1.447e+05 0.000
## H.npnct12.log 3.562e-01 0.777
## H.T.word 1.076e+00 2.114
## H.T.big 6.323e-01 -0.379
## A.npnct02.log 4.216e+04 0.000
## A.npnct17.log 7.742e+04 0.007
## S.P.year.colon 1.190e+04 0.000
## S.T.obama 5.516e+05 -0.009
## A.T.obama 5.516e+05 0.009
## S.npnct20.log 1.925e+04 -0.002
## H.npnct02.log 1.341e+04 -0.002
## H.T.test 7.335e-01 -0.291
## H.P.on.this.day 1.508e+04 -0.001
## S.P.first.draft 1.111e+04 -0.002
## S.T.take 7.982e+06 0.000
## A.T.take 7.991e+06 0.000
## S.npnct06.log 1.827e+00 -0.103
## A.npnct14.log 1.772e+00 0.747
## S.T.time 4.654e+06 0.008
## A.T.time 4.659e+06 -0.008
## H.T.newyorktim 8.269e-01 -0.240
## H.npnct13.log 2.513e-01 -1.320
## H.T.deal 7.082e+03 -0.004
## S.T.new 3.591e+06 -0.003
## A.T.new 3.593e+06 0.003
## H.T.billion 1.276e+00 0.992
## S.P.metropolitan.diary.colon 4.030e+00 -2.320
## H.T.polit 4.608e-01 -1.427
## H.P.verbatim.colon 1.016e+04 -0.002
## H.T.china 1.020e+00 -0.736
## H.T.art 1.072e+00 -1.098
## `PubDate.minute.fctr(14.8,29.5]` 2.031e-01 -0.767
## `PubDate.minute.fctr(29.5,44.2]` 1.976e-01 -0.991
## `PubDate.minute.fctr(44.2,59.1]` 2.037e-01 0.423
## H.T.read 6.002e-01 -1.570
## S.npnct12.log 2.357e-01 -0.785
## H.P.today.in.politic 3.268e+04 0.000
## A.T.year 4.105e+06 0.002
## S.T.year 4.105e+06 -0.002
## H.P.what.we.are 7.795e+03 -0.002
## A.T.will 2.948e+05 0.012
## S.T.will 2.945e+05 -0.012
## A.T.appear 1.314e+00 -0.371
## S.T.appear NA NA
## PubDate.wkday.fctr1 5.801e-01 -0.768
## PubDate.wkday.fctr2 6.309e-01 -1.701
## PubDate.wkday.fctr3 6.239e-01 -0.951
## PubDate.wkday.fctr4 6.141e-01 -1.345
## PubDate.wkday.fctr5 6.233e-01 -1.182
## PubDate.wkday.fctr6 5.668e-01 -2.158
## H.T.pictur 7.061e-01 0.074
## H.T.new 5.577e-01 -1.048
## A.T.senat 3.423e+05 0.006
## S.T.senat 3.419e+05 -0.006
## S.T.show 6.401e+06 0.002
## A.T.show 6.401e+06 -0.002
## H.P.today.in.smallbusiness 7.687e+03 -0.002
## S.T.day 8.755e+06 0.000
## A.T.day 8.771e+06 0.000
## S.npnct28.log 2.017e+05 0.000
## A.npnct28.log 2.003e+05 0.000
## H.P.daily.clip.report 1.523e+04 0.002
## H.T.clip NA NA
## A.T.first 3.683e+06 0.000
## H.T.news 8.680e-01 -0.934
## S.T.first 3.683e+06 0.000
## H.T.first 1.224e+00 -0.561
## H.T.X2014 1.167e+00 -0.768
## A.T.newyork 4.365e+06 0.008
## S.T.newyork 4.365e+06 -0.008
## A.T.report 2.496e+06 -0.011
## A.T.compani 1.348e+06 0.004
## S.T.report 2.496e+06 0.011
## S.T.compani 1.348e+06 -0.004
## A.T.word 5.650e+06 -0.001
## S.T.word 5.650e+06 0.001
## H.T.morn 7.617e+03 0.002
## H.T.busi 7.040e-01 -0.542
## A.T.newyorktim 5.266e+06 -0.002
## S.T.newyorktim 5.257e+06 0.002
## A.npnct13.log 1.586e+04 0.008
## A.T.share 1.162e+00 -1.629
## S.T.share NA NA
## H.npnct04.log 1.500e+00 -1.695
## S.npnct13.log 1.586e+04 -0.008
## A.T.articl 5.120e+06 -0.002
## S.T.articl 5.120e+06 0.002
## H.T.newyork 5.929e-01 -0.873
## H.T.today 8.023e-01 -0.924
## H.T.springsumm 1.419e+04 0.002
## H.T.day 7.305e-01 -0.679
## H.npnct14.log 1.347e+04 -0.002
## A.T.diari 8.494e+00 2.472
## S.T.diari NA NA
## H.T.report 9.528e-01 -1.121
## S.npnct04.log 7.943e-01 -1.187
## H.T.daili 7.644e+03 -0.003
## H.T.X2015 1.135e+04 -0.003
## A.T.herald 1.160e+04 0.006
## S.T.herald NA NA
## S.npnct15.log 6.115e-01 0.532
## H.T.week 7.597e-01 -1.153
## A.T.photo 2.203e+00 -1.085
## S.T.photo NA NA
## A.T.intern 8.986e+05 0.000
## S.T.intern 8.986e+05 0.000
## A.T.tribun 1.206e+04 -0.005
## S.T.tribun NA NA
## S.P.fashion.week 3.234e+03 0.001
## A.T.archiv 1.095e+04 -0.005
## S.T.archiv NA NA
## H.P.fashion.week 2.546e+03 -0.007
## H.P.year.colon 4.363e+03 -0.004
## H.T.fashion 1.821e+00 1.432
## H.npnct15.log 4.168e-01 -3.756
## A.T.fashion 9.075e+06 -0.005
## S.T.fashion 9.075e+06 0.005
## A.T.week 1.998e+06 0.001
## S.T.week 1.998e+06 -0.001
## H.nstopwrds.log 6.633e-01 -1.796
## H.npnct28.log 1.975e+00 -0.788
## S.npnct11.log 1.802e-01 -0.476
## S.nstopwrds.log 2.673e+06 0.004
## A.nstopwrds.log 2.673e+06 -0.004
## H.ndgts.log 3.194e-01 2.826
## S.ndgts.log 2.308e-01 0.879
## H.nuppr.log 7.649e-01 2.602
## H.nwrds.log 1.108e+00 0.894
## H.nchrs.log 6.082e-01 -1.624
## S.nwrds.log 2.482e+06 -0.003
## A.nwrds.log 2.482e+06 0.003
## H.nwrds.unq.log 5.239e-01 -2.117
## S.nchrs.log 9.100e-01 -0.025
## A.nwrds.unq.log 3.249e+05 0.011
## S.nwrds.unq.log 3.249e+05 -0.011
## S.nuppr.log 1.890e-01 -2.782
## Pr(>|z|)
## (Intercept) 0.557954
## WordCount.log < 2e-16 ***
## A.ratio.sum.TfIdf.nwrds 0.411722
## S.ratio.sum.TfIdf.nwrds 0.408053
## H.ratio.sum.TfIdf.nwrds 0.283200
## .clusterid.fctr101 0.211104
## .clusterid.fctr102 0.137932
## .clusterid.fctr103 0.156939
## .clusterid.fctr104 0.171245
## .clusterid.fctr401 0.001750 **
## .clusterid.fctr402 0.997129
## .clusterid.fctr403 0.010768 *
## .clusterid.fctr404 0.006798 **
## .clusterid.fctr405 0.014528 *
## .clusterid.fctr406 0.004630 **
## .clusterid.fctr407 0.997156
## .clusterid.fctr408 0.001424 **
## .clusterid.fctr409 0.001975 **
## .clusterid.fctr410 0.997891
## .clusterid.fctr411 0.998270
## .clusterid.fctr412 0.998072
## .clusterid.fctr413 0.998334
## .clusterid.fctr414 0.107911
## .clusterid.fctr415 0.978155
## .clusterid.fctr501 0.008213 **
## .clusterid.fctr502 0.000572 ***
## .clusterid.fctr503 0.000632 ***
## .clusterid.fctr504 0.000368 ***
## .clusterid.fctr505 0.008710 **
## .clusterid.fctr506 0.001279 **
## .clusterid.fctr507 0.001264 **
## .clusterid.fctr508 0.002347 **
## .clusterid.fctr509 0.002204 **
## .clusterid.fctr510 0.008308 **
## .clusterid.fctr511 0.001996 **
## .clusterid.fctr512 0.007585 **
## .clusterid.fctr513 0.005492 **
## .clusterid.fctr701 0.011647 *
## .clusterid.fctr702 0.010557 *
## .clusterid.fctr703 0.997285
## .clusterid.fctr704 0.004623 **
## .clusterid.fctr705 0.004196 **
## .clusterid.fctr706 0.043054 *
## .clusterid.fctr707 0.005537 **
## .clusterid.fctr1101 0.367501
## .clusterid.fctr1102 0.853246
## .clusterid.fctr1103 0.701387
## .clusterid.fctr1104 0.288601
## .clusterid.fctr1105 0.190470
## .clusterid.fctr1106 0.265850
## .clusterid.fctr1107 0.034237 *
## .clusterid.fctr1108 0.288818
## .clusterid.fctr1109 0.998801
## .clusterid.fctr1501 0.003540 **
## .clusterid.fctr1502 0.001107 **
## .clusterid.fctr1503 0.010948 *
## .clusterid.fctr1504 0.000328 ***
## .clusterid.fctr1505 0.001350 **
## .clusterid.fctr1506 0.034635 *
## .clusterid.fctr1507 0.004662 **
## .clusterid.fctr1508 0.006220 **
## .clusterid.fctr1509 0.005747 **
## .clusterid.fctr1510 0.001060 **
## .clusterid.fctr1511 0.999405
## .clusterid.fctr1512 0.009063 **
## .clusterid.fctr1513 0.007489 **
## .clusterid.fctr1514 0.997482
## .clusterid.fctr1515 0.000233 ***
## .clusterid.fctr1516 0.014837 *
## .clusterid.fctr1517 0.034846 *
## .clusterid.fctr1518 0.000676 ***
## .clusterid.fctr1519 0.001239 **
## .clusterid.fctr1520 0.998364
## .clusterid.fctr1521 0.008163 **
## .clusterid.fctr1522 0.003442 **
## .clusterid.fctr1523 0.998279
## .clusterid.fctr1524 0.078678 .
## .clusterid.fctr1801 0.027420 *
## .clusterid.fctr1802 0.075837 .
## .clusterid.fctr1803 0.124228
## .clusterid.fctr1804 0.239840
## H.sum.TfIdf 0.172254
## S.sum.TfIdf 0.231790
## A.sum.TfIdf 0.235514
## `PubDate.hour.fctr(7.67,15.3]` 0.371433
## `PubDate.hour.fctr(15.3,23]` 0.112849
## H.npnct19.log 3.59e-05 ***
## A.ratio.nstopwrds.nwrds 0.996128
## S.ratio.nstopwrds.nwrds 0.996127
## PubDate.wkend 0.446778
## H.P.recap.colon 0.173699
## H.P.quandary 0.998966
## H.P.no.comment.colon 0.088134 .
## S.npnct19.log 0.000656 ***
## H.P.facts.figures 0.867537
## H.npnct08.log 0.011000 *
## PubDate.last10.log 0.067939 .
## PubDate.last1.log 0.283461
## H.P.readers.respond 1.89e-09 ***
## A.T.make 0.997361
## S.T.make 0.997361
## H.ratio.nstopwrds.nwrds 0.129635
## H.T.get 0.452458
## H.npnct06.log 0.279989
## S.npnct01.log 0.312484
## A.T.can 0.994074
## H.npnct16.log 0.554180
## S.T.can 0.994066
## H.T.ebola 0.830656
## H.npnct01.log 0.349932
## A.T.said 0.997350
## S.T.said 0.997349
## H.T.make 0.570689
## H.npnct11.log 0.038355 *
## `myCategory.fctrForeign#World#Asia Pacific` 0.000246 ***
## `myCategory.fctr#Multimedia#` 0.000204 ***
## `myCategory.fctrCulture#Arts#` NA
## `myCategory.fctrBusiness#Business Day#Dealbook` NA
## myCategory.fctrmyOther 0.997443
## `myCategory.fctrBusiness#Technology#` NA
## `myCategory.fctrBusiness#Crosswords/Games#` 0.441463
## `myCategory.fctrTStyle##` 3.14e-05 ***
## `myCategory.fctrForeign#World#` 0.996100
## `myCategory.fctrOpEd#Opinion#` NA
## `myCategory.fctrStyles##Fashion` 0.994514
## `myCategory.fctr#Opinion#Room For Debate` 3.95e-07 ***
## `myCategory.fctr#U.S.#Education` 0.992517
## `myCategory.fctr##` NA
## `myCategory.fctrMetro#N.Y. / Region#` 0.015780 *
## `myCategory.fctrBusiness#Business Day#Small Business` 8.07e-05 ***
## `myCategory.fctrStyles#U.S.#` NA
## `myCategory.fctrTravel#Travel#` 0.001015 **
## `myCategory.fctr#Opinion#The Public Editor` NA
## A.T.one 0.990794
## S.T.one 0.990793
## H.P.s.notebook 0.999353
## H.T.take 0.414601
## S.npnct16.log 0.432071
## A.T.presid 0.994425
## S.T.presid 0.994426
## S.npnct08.log 0.113469
## PubDate.last100.log 0.835993
## .rnorm 0.274313
## H.npnct05.log 0.999150
## H.P.friday.night.music 0.147284
## H.T.say 0.220371
## H.T.obama 0.692639
## H.T.bank 0.858662
## `PubDate.date.fctr(7,13]` 0.984626
## `PubDate.date.fctr(13,19]` 0.789222
## `PubDate.date.fctr(19,25]` 0.786188
## `PubDate.date.fctr(25,31]` 0.500710
## `PubDate.second.fctr(14.8,29.5]` 0.685529
## `PubDate.second.fctr(29.5,44.2]` 0.776327
## `PubDate.second.fctr(44.2,59.1]` 0.177873
## H.npnct07.log 0.300444
## S.npnct07.log 0.998835
## S.npnct03.log 0.998836
## A.npnct18.log 0.999913
## H.npnct12.log 0.436967
## H.T.word 0.034481 *
## H.T.big 0.704761
## A.npnct02.log 0.999619
## A.npnct17.log 0.994603
## S.P.year.colon 0.999761
## S.T.obama 0.992801
## A.T.obama 0.992801
## S.npnct20.log 0.998730
## H.npnct02.log 0.998754
## H.T.test 0.770708
## H.P.on.this.day 0.999114
## S.P.first.draft 0.998719
## S.T.take 0.999781
## A.T.take 0.999781
## S.npnct06.log 0.917572
## A.npnct14.log 0.455286
## S.T.time 0.993481
## A.T.time 0.993481
## H.T.newyorktim 0.810128
## H.npnct13.log 0.186986
## H.T.deal 0.997075
## S.T.new 0.997481
## A.T.new 0.997481
## H.T.billion 0.321121
## S.P.metropolitan.diary.colon 0.020324 *
## H.T.polit 0.153583
## H.P.verbatim.colon 0.998718
## H.T.china 0.461768
## H.T.art 0.272345
## `PubDate.minute.fctr(14.8,29.5]` 0.443208
## `PubDate.minute.fctr(29.5,44.2]` 0.321857
## `PubDate.minute.fctr(44.2,59.1]` 0.672289
## H.T.read 0.116480
## S.npnct12.log 0.432709
## H.P.today.in.politic 0.999999
## A.T.year 0.998559
## S.T.year 0.998559
## H.P.what.we.are 0.998060
## A.T.will 0.990226
## S.T.will 0.990224
## A.T.appear 0.710432
## S.T.appear NA
## PubDate.wkday.fctr1 0.442733
## PubDate.wkday.fctr2 0.089008 .
## PubDate.wkday.fctr3 0.341659
## PubDate.wkday.fctr4 0.178733
## PubDate.wkday.fctr5 0.237200
## PubDate.wkday.fctr6 0.030926 *
## H.T.pictur 0.940909
## H.T.new 0.294814
## A.T.senat 0.995609
## S.T.senat 0.995611
## S.T.show 0.998680
## A.T.show 0.998680
## H.P.today.in.smallbusiness 0.998264
## S.T.day 0.999814
## A.T.day 0.999814
## S.npnct28.log 0.999753
## A.npnct28.log 0.999602
## H.P.daily.clip.report 0.998532
## H.T.clip NA
## A.T.first 0.999768
## H.T.news 0.350066
## S.T.first 0.999768
## H.T.first 0.574733
## H.T.X2014 0.442497
## A.T.newyork 0.993308
## S.T.newyork 0.993308
## A.T.report 0.991422
## A.T.compani 0.996946
## S.T.report 0.991422
## S.T.compani 0.996945
## A.T.word 0.999058
## S.T.word 0.999058
## H.T.morn 0.998419
## H.T.busi 0.587734
## A.T.newyorktim 0.998745
## S.T.newyorktim 0.998745
## A.npnct13.log 0.993842
## A.T.share 0.103216
## S.T.share NA
## H.npnct04.log 0.089986 .
## S.npnct13.log 0.993895
## A.T.articl 0.998602
## S.T.articl 0.998603
## H.T.newyork 0.382535
## H.T.today 0.355485
## H.T.springsumm 0.998112
## H.T.day 0.497164
## H.npnct14.log 0.998500
## A.T.diari 0.013420 *
## S.T.diari NA
## H.T.report 0.262349
## S.npnct04.log 0.235208
## H.T.daili 0.997302
## H.T.X2015 0.997806
## A.T.herald 0.994853
## S.T.herald NA
## S.npnct15.log 0.594477
## H.T.week 0.249101
## A.T.photo 0.278130
## S.T.photo NA
## A.T.intern 0.999750
## S.T.intern 0.999747
## A.T.tribun 0.996333
## S.T.tribun NA
## S.P.fashion.week 0.999103
## A.T.archiv 0.996083
## S.T.archiv NA
## H.P.fashion.week 0.994435
## H.P.year.colon 0.997117
## H.T.fashion 0.152210
## H.npnct15.log 0.000172 ***
## A.T.fashion 0.996330
## S.T.fashion 0.996336
## A.T.week 0.999521
## S.T.week 0.999521
## H.nstopwrds.log 0.072525 .
## H.npnct28.log 0.430615
## S.npnct11.log 0.634333
## S.nstopwrds.log 0.996893
## A.nstopwrds.log 0.996894
## H.ndgts.log 0.004708 **
## S.ndgts.log 0.379364
## H.nuppr.log 0.009272 **
## H.nwrds.log 0.371416
## H.nchrs.log 0.104286
## S.nwrds.log 0.997574
## A.nwrds.log 0.997574
## H.nwrds.unq.log 0.034301 *
## S.nchrs.log 0.979688
## A.nwrds.unq.log 0.990981
## S.nwrds.unq.log 0.990978
## S.nuppr.log 0.005397 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4042.7 on 4474 degrees of freedom
## Residual deviance: 1582.9 on 4196 degrees of freedom
## AIC: 2140.9
##
## Number of Fisher Scoring iterations: 21
##
## [1] " calling mypredict_mdl for fit:"
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## threshold f.score
## 1 0.0 0.2867534
## 2 0.1 0.6937198
## 3 0.2 0.7545249
## 4 0.3 0.7786733
## 5 0.4 0.7799331
## 6 0.5 0.7746979
## 7 0.6 0.7479308
## 8 0.7 0.7034373
## 9 0.8 0.6259808
## 10 0.9 0.5014691
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.4000 to maximize f.score.fit"
## Popular.fctr Popular.fctr.predict.All.X.glm.N
## 1 N 3563
## 2 Y 166
## Popular.fctr.predict.All.X.glm.Y
## 1 163
## 2 583
## Prediction
## Reference N Y
## N 3563 163
## Y 166 583
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.264804e-01 7.358019e-01 9.184402e-01 9.339586e-01 8.326257e-01
## AccuracyPValue McnemarPValue
## 6.383233e-77 9.122003e-01
## [1] " calling mypredict_mdl for OOB:"
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## threshold f.score
## 1 0.0 0.2865473
## 2 0.1 0.6460554
## 3 0.2 0.7002519
## 4 0.3 0.7173913
## 5 0.4 0.7142857
## 6 0.5 0.6913580
## 7 0.6 0.6688742
## 8 0.7 0.6312057
## 9 0.8 0.5555556
## 10 0.9 0.4392324
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.3000 to maximize f.score.OOB"
## Popular.fctr Popular.fctr.predict.All.X.glm.N
## 1 N 1585
## 2 Y 80
## Popular.fctr.predict.All.X.glm.Y
## 1 128
## 2 264
## Prediction
## Reference N Y
## N 1585 128
## Y 80 264
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.988819e-01 6.561351e-01 8.850365e-01 9.115781e-01 8.327662e-01
## AccuracyPValue McnemarPValue
## 8.234641e-18 1.118594e-03
## model_id model_method
## 1 All.X.glm glm
## feats
## 1 WordCount.log, A.ratio.sum.TfIdf.nwrds, S.ratio.sum.TfIdf.nwrds, H.ratio.sum.TfIdf.nwrds, .clusterid.fctr, H.sum.TfIdf, S.sum.TfIdf, A.sum.TfIdf, PubDate.hour.fctr, H.npnct19.log, A.ratio.nstopwrds.nwrds, S.ratio.nstopwrds.nwrds, PubDate.wkend, H.P.recap.colon, H.P.quandary, H.P.no.comment.colon, S.npnct19.log, H.P.facts.figures, H.npnct08.log, PubDate.last10.log, PubDate.last1.log, H.P.readers.respond, A.T.make, S.T.make, H.ratio.nstopwrds.nwrds, H.T.get, H.npnct06.log, S.npnct01.log, A.T.can, H.npnct16.log, S.T.can, H.T.ebola, H.npnct01.log, A.T.said, S.T.said, H.T.make, H.npnct11.log, myCategory.fctr, A.T.one, S.T.one, H.P.s.notebook, H.T.take, S.npnct16.log, A.T.presid, S.T.presid, S.npnct08.log, PubDate.last100.log, .rnorm, H.npnct05.log, H.P.friday.night.music, H.T.say, H.T.obama, H.T.bank, PubDate.date.fctr, PubDate.second.fctr, H.npnct07.log, S.npnct07.log, S.npnct03.log, A.npnct18.log, H.npnct12.log, H.T.word, H.T.big, A.npnct02.log, A.npnct17.log, S.P.year.colon, S.T.obama, A.T.obama, S.npnct20.log, H.npnct02.log, H.T.test, H.P.on.this.day, S.P.first.draft, S.T.take, A.T.take, S.npnct06.log, A.npnct14.log, S.T.time, A.T.time, H.T.newyorktim, H.npnct13.log, H.T.deal, S.T.new, A.T.new, H.T.billion, S.P.metropolitan.diary.colon, H.T.polit, H.P.verbatim.colon, H.T.china, H.T.art, PubDate.minute.fctr, H.T.read, S.npnct12.log, H.P.today.in.politic, A.T.year, S.T.year, H.P.what.we.are, A.T.will, S.T.will, A.T.appear, S.T.appear, PubDate.wkday.fctr, H.T.pictur, H.T.new, A.T.senat, S.T.senat, S.T.show, A.T.show, H.P.today.in.smallbusiness, S.T.day, A.T.day, S.npnct28.log, A.npnct28.log, H.P.daily.clip.report, H.T.clip, A.T.first, H.T.news, S.T.first, H.T.first, H.T.X2014, A.T.newyork, S.T.newyork, A.T.report, A.T.compani, S.T.report, S.T.compani, A.T.word, S.T.word, H.T.morn, H.T.busi, A.T.newyorktim, S.T.newyorktim, A.npnct13.log, A.T.share, S.T.share, H.npnct04.log, S.npnct13.log, A.T.articl, S.T.articl, H.T.newyork, H.T.today, H.T.springsumm, H.T.day, 
H.npnct14.log, A.T.diari, S.T.diari, H.T.report, S.npnct04.log, H.T.daili, H.T.X2015, A.T.herald, S.T.herald, S.npnct15.log, H.T.week, A.T.photo, S.T.photo, A.T.intern, S.T.intern, A.T.tribun, S.T.tribun, S.P.fashion.week, A.T.archiv, S.T.archiv, H.P.fashion.week, H.P.year.colon, H.T.fashion, H.npnct15.log, A.T.fashion, S.T.fashion, A.T.week, S.T.week, H.nstopwrds.log, H.npnct28.log, S.npnct11.log, S.nstopwrds.log, A.nstopwrds.log, H.ndgts.log, S.ndgts.log, H.nuppr.log, H.nwrds.log, H.nchrs.log, S.nwrds.log, A.nwrds.log, H.nwrds.unq.log, S.nchrs.log, A.nwrds.unq.log, S.nwrds.unq.log, S.nuppr.log
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 1 28.039 14.339
## max.auc.fit opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.9641175 0.4 0.7799331 0.890503
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.9184402 0.9339586 0.5963536 0.9159268
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.3 0.7173913 0.8988819
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB min.aic.fit
## 1 0.8850365 0.9115781 0.6561351 2140.884
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.008743821 0.0323972
## label step_major step_minor bgn end elapsed
## 2 fit.models_1_glm 2 0 430.059 466.393 36.334
## 3 fit.models_1_rpart 3 0 466.394 NA NA
## [1] "fitting model: All.X.no.rnorm.rpart"
## [1] " indep_vars: WordCount.log, A.ratio.sum.TfIdf.nwrds, S.ratio.sum.TfIdf.nwrds, H.ratio.sum.TfIdf.nwrds, .clusterid.fctr, H.sum.TfIdf, S.sum.TfIdf, A.sum.TfIdf, PubDate.hour.fctr, H.npnct19.log, A.ratio.nstopwrds.nwrds, S.ratio.nstopwrds.nwrds, PubDate.wkend, H.P.recap.colon, H.P.quandary, H.P.no.comment.colon, S.npnct19.log, H.P.facts.figures, H.npnct08.log, PubDate.last10.log, PubDate.last1.log, H.P.readers.respond, A.T.make, S.T.make, H.ratio.nstopwrds.nwrds, H.T.get, H.npnct06.log, S.npnct01.log, A.T.can, H.npnct16.log, S.T.can, H.T.ebola, H.npnct01.log, A.T.said, S.T.said, H.T.make, H.npnct11.log, myCategory.fctr, A.T.one, S.T.one, H.P.s.notebook, H.T.take, S.npnct16.log, A.T.presid, S.T.presid, S.npnct08.log, PubDate.last100.log, H.npnct05.log, H.P.friday.night.music, H.T.say, H.T.obama, H.T.bank, PubDate.date.fctr, PubDate.second.fctr, H.npnct07.log, S.npnct07.log, S.npnct03.log, A.npnct18.log, H.npnct12.log, H.T.word, H.T.big, A.npnct02.log, A.npnct17.log, S.P.year.colon, S.T.obama, A.T.obama, S.npnct20.log, H.npnct02.log, H.T.test, H.P.on.this.day, S.P.first.draft, S.T.take, A.T.take, S.npnct06.log, A.npnct14.log, S.T.time, A.T.time, H.T.newyorktim, H.npnct13.log, H.T.deal, S.T.new, A.T.new, H.T.billion, S.P.metropolitan.diary.colon, H.T.polit, H.P.verbatim.colon, H.T.china, H.T.art, PubDate.minute.fctr, H.T.read, S.npnct12.log, H.P.today.in.politic, A.T.year, S.T.year, H.P.what.we.are, A.T.will, S.T.will, A.T.appear, S.T.appear, PubDate.wkday.fctr, H.T.pictur, H.T.new, A.T.senat, S.T.senat, S.T.show, A.T.show, H.P.today.in.smallbusiness, S.T.day, A.T.day, S.npnct28.log, A.npnct28.log, H.P.daily.clip.report, H.T.clip, A.T.first, H.T.news, S.T.first, H.T.first, H.T.X2014, A.T.newyork, S.T.newyork, A.T.report, A.T.compani, S.T.report, S.T.compani, A.T.word, S.T.word, H.T.morn, H.T.busi, A.T.newyorktim, S.T.newyorktim, A.npnct13.log, A.T.share, S.T.share, H.npnct04.log, S.npnct13.log, A.T.articl, S.T.articl, H.T.newyork, H.T.today, H.T.springsumm, 
H.T.day, H.npnct14.log, A.T.diari, S.T.diari, H.T.report, S.npnct04.log, H.T.daili, H.T.X2015, A.T.herald, S.T.herald, S.npnct15.log, H.T.week, A.T.photo, S.T.photo, A.T.intern, S.T.intern, A.T.tribun, S.T.tribun, S.P.fashion.week, A.T.archiv, S.T.archiv, H.P.fashion.week, H.P.year.colon, H.T.fashion, H.npnct15.log, A.T.fashion, S.T.fashion, A.T.week, S.T.week, H.nstopwrds.log, H.npnct28.log, S.npnct11.log, S.nstopwrds.log, A.nstopwrds.log, H.ndgts.log, S.ndgts.log, H.nuppr.log, H.nwrds.log, H.nchrs.log, S.nwrds.log, A.nwrds.log, H.nwrds.unq.log, S.nchrs.log, A.nwrds.unq.log, S.nwrds.unq.log, S.nuppr.log"
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.0214 on full training set
## Warning in myfit_mdl(model_id = model_id, model_method = method,
## indep_vars_vctr = indep_vars_vctr, : model's bestTune found at an extreme
## of tuneGrid for parameter: cp
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 4475
##
## CP nsplit rel error
## 1 0.27102804 0 1.0000000
## 2 0.08411215 1 0.7289720
## 3 0.02136182 2 0.6448598
##
## Variable importance
## myCategory.fctrOpEd#Opinion#
## 46
## myCategory.fctrBusiness#Crosswords/Games#
## 15
## .clusterid.fctr1101
## 11
## .clusterid.fctr1102
## 8
## .clusterid.fctr1103
## 6
## A.nwrds.unq.log
## 6
## S.nwrds.unq.log
## 6
## H.nchrs.log
## 1
##
## Node number 1: 4475 observations, complexity param=0.271028
## predicted class=N expected loss=0.1673743 P(node) =1
## class counts: 3726 749
## probabilities: 0.833 0.167
## left son=2 (4106 obs) right son=3 (369 obs)
## Primary splits:
## myCategory.fctrOpEd#Opinion# < 0.5 to the left, improve=297.02950, (0 missing)
## WordCount.log < 6.524296 to the left, improve=105.72630, (0 missing)
## S.nuppr.log < 1.497866 to the right, improve= 86.35796, (0 missing)
## myCategory.fctrBusiness#Crosswords/Games# < 0.5 to the left, improve= 85.77765, (0 missing)
## A.ratio.sum.TfIdf.nwrds < 1.155339 to the left, improve= 81.74534, (0 missing)
## Surrogate splits:
## .clusterid.fctr1101 < 0.5 to the left, agree=0.938, adj=0.244, (0 split)
## .clusterid.fctr1102 < 0.5 to the left, agree=0.931, adj=0.165, (0 split)
## .clusterid.fctr1103 < 0.5 to the left, agree=0.928, adj=0.133, (0 split)
## A.nwrds.unq.log < 1.497866 to the right, agree=0.928, adj=0.125, (0 split)
## S.nwrds.unq.log < 1.497866 to the right, agree=0.928, adj=0.122, (0 split)
##
## Node number 2: 4106 observations, complexity param=0.08411215
## predicted class=N expected loss=0.1127618 P(node) =0.9175419
## class counts: 3643 463
## probabilities: 0.887 0.113
## left son=4 (4023 obs) right son=5 (83 obs)
## Primary splits:
## myCategory.fctrBusiness#Crosswords/Games# < 0.5 to the left, improve=99.60741, (0 missing)
## WordCount.log < 6.485398 to the left, improve=94.68604, (0 missing)
## myCategory.fctrStyles#U.S.# < 0.5 to the left, improve=50.94648, (0 missing)
## .clusterid.fctr101 < 0.5 to the left, improve=44.64818, (0 missing)
## S.nuppr.log < 1.497866 to the right, improve=31.44556, (0 missing)
## Surrogate splits:
## H.nchrs.log < 2.35024 to the right, agree=0.981, adj=0.060, (0 split)
## H.ratio.sum.TfIdf.nwrds < 6.393259 to the left, agree=0.980, adj=0.024, (0 split)
## H.nuppr.log < 0.8958797 to the right, agree=0.980, adj=0.024, (0 split)
## H.nwrds.log < 0.8958797 to the right, agree=0.980, adj=0.024, (0 split)
## A.T.make < 0.9262225 to the left, agree=0.980, adj=0.012, (0 split)
##
## Node number 3: 369 observations
## predicted class=Y expected loss=0.2249322 P(node) =0.0824581
## class counts: 83 286
## probabilities: 0.225 0.775
##
## Node number 4: 4023 observations
## predicted class=N expected loss=0.09694258 P(node) =0.8989944
## class counts: 3633 390
## probabilities: 0.903 0.097
##
## Node number 5: 83 observations
## predicted class=Y expected loss=0.1204819 P(node) =0.01854749
## class counts: 10 73
## probabilities: 0.120 0.880
##
## n= 4475
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 4475 749 N (0.83262570 0.16737430)
## 2) myCategory.fctrOpEd#Opinion#< 0.5 4106 463 N (0.88723819 0.11276181)
## 4) myCategory.fctrBusiness#Crosswords/Games#< 0.5 4023 390 N (0.90305742 0.09694258) *
## 5) myCategory.fctrBusiness#Crosswords/Games#>=0.5 83 10 Y (0.12048193 0.87951807) *
## 3) myCategory.fctrOpEd#Opinion#>=0.5 369 83 Y (0.22493225 0.77506775) *
## [1] " calling mypredict_mdl for fit:"
## threshold f.score
## 1 0.0 0.2867534
## 2 0.1 0.5978351
## 3 0.2 0.5978351
## 4 0.3 0.5978351
## 5 0.4 0.5978351
## 6 0.5 0.5978351
## 7 0.6 0.5978351
## 8 0.7 0.5978351
## 9 0.8 0.1754808
## 10 0.9 0.0000000
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.7000 to maximize f.score.fit"
## Popular.fctr Popular.fctr.predict.All.X.no.rnorm.rpart.N
## 1 N 3633
## 2 Y 390
## Popular.fctr.predict.All.X.no.rnorm.rpart.Y
## 1 93
## 2 359
## Prediction
## Reference N Y
## N 3633 93
## Y 390 359
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.920670e-01 5.398657e-01 8.826068e-01 9.010121e-01 8.326257e-01
## AccuracyPValue McnemarPValue
## 1.439953e-29 2.397951e-41
## [1] " calling mypredict_mdl for OOB:"
## threshold f.score
## 1 0.0 0.2865473
## 2 0.1 0.5650558
## 3 0.2 0.5650558
## 4 0.3 0.5650558
## 5 0.4 0.5650558
## 6 0.5 0.5650558
## 7 0.6 0.5650558
## 8 0.7 0.5650558
## 9 0.8 0.1562500
## 10 0.9 0.0000000
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.7000 to maximize f.score.OOB"
## Popular.fctr Popular.fctr.predict.All.X.no.rnorm.rpart.N
## 1 N 1671
## 2 Y 192
## Popular.fctr.predict.All.X.no.rnorm.rpart.Y
## 1 42
## 2 152
## Prediction
## Reference N Y
## N 1671 42
## Y 192 152
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.862421e-01 5.054039e-01 8.717239e-01 8.996488e-01 8.327662e-01
## AccuracyPValue McnemarPValue
## 5.783557e-12 2.026854e-22
## model_id model_method
## 1 All.X.no.rnorm.rpart rpart
## feats
## 1 WordCount.log, A.ratio.sum.TfIdf.nwrds, S.ratio.sum.TfIdf.nwrds, H.ratio.sum.TfIdf.nwrds, .clusterid.fctr, H.sum.TfIdf, S.sum.TfIdf, A.sum.TfIdf, PubDate.hour.fctr, H.npnct19.log, A.ratio.nstopwrds.nwrds, S.ratio.nstopwrds.nwrds, PubDate.wkend, H.P.recap.colon, H.P.quandary, H.P.no.comment.colon, S.npnct19.log, H.P.facts.figures, H.npnct08.log, PubDate.last10.log, PubDate.last1.log, H.P.readers.respond, A.T.make, S.T.make, H.ratio.nstopwrds.nwrds, H.T.get, H.npnct06.log, S.npnct01.log, A.T.can, H.npnct16.log, S.T.can, H.T.ebola, H.npnct01.log, A.T.said, S.T.said, H.T.make, H.npnct11.log, myCategory.fctr, A.T.one, S.T.one, H.P.s.notebook, H.T.take, S.npnct16.log, A.T.presid, S.T.presid, S.npnct08.log, PubDate.last100.log, H.npnct05.log, H.P.friday.night.music, H.T.say, H.T.obama, H.T.bank, PubDate.date.fctr, PubDate.second.fctr, H.npnct07.log, S.npnct07.log, S.npnct03.log, A.npnct18.log, H.npnct12.log, H.T.word, H.T.big, A.npnct02.log, A.npnct17.log, S.P.year.colon, S.T.obama, A.T.obama, S.npnct20.log, H.npnct02.log, H.T.test, H.P.on.this.day, S.P.first.draft, S.T.take, A.T.take, S.npnct06.log, A.npnct14.log, S.T.time, A.T.time, H.T.newyorktim, H.npnct13.log, H.T.deal, S.T.new, A.T.new, H.T.billion, S.P.metropolitan.diary.colon, H.T.polit, H.P.verbatim.colon, H.T.china, H.T.art, PubDate.minute.fctr, H.T.read, S.npnct12.log, H.P.today.in.politic, A.T.year, S.T.year, H.P.what.we.are, A.T.will, S.T.will, A.T.appear, S.T.appear, PubDate.wkday.fctr, H.T.pictur, H.T.new, A.T.senat, S.T.senat, S.T.show, A.T.show, H.P.today.in.smallbusiness, S.T.day, A.T.day, S.npnct28.log, A.npnct28.log, H.P.daily.clip.report, H.T.clip, A.T.first, H.T.news, S.T.first, H.T.first, H.T.X2014, A.T.newyork, S.T.newyork, A.T.report, A.T.compani, S.T.report, S.T.compani, A.T.word, S.T.word, H.T.morn, H.T.busi, A.T.newyorktim, S.T.newyorktim, A.npnct13.log, A.T.share, S.T.share, H.npnct04.log, S.npnct13.log, A.T.articl, S.T.articl, H.T.newyork, H.T.today, H.T.springsumm, H.T.day, 
H.npnct14.log, A.T.diari, S.T.diari, H.T.report, S.npnct04.log, H.T.daili, H.T.X2015, A.T.herald, S.T.herald, S.npnct15.log, H.T.week, A.T.photo, S.T.photo, A.T.intern, S.T.intern, A.T.tribun, S.T.tribun, S.P.fashion.week, A.T.archiv, S.T.archiv, H.P.fashion.week, H.P.year.colon, H.T.fashion, H.npnct15.log, A.T.fashion, S.T.fashion, A.T.week, S.T.week, H.nstopwrds.log, H.npnct28.log, S.npnct11.log, S.nstopwrds.log, A.nstopwrds.log, H.ndgts.log, S.ndgts.log, H.nuppr.log, H.nwrds.log, H.nchrs.log, S.nwrds.log, A.nwrds.log, H.nwrds.unq.log, S.nchrs.log, A.nwrds.unq.log, S.nwrds.unq.log, S.nuppr.log
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 3 17.7 3.558
## max.auc.fit opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.7277461 0.7 0.5978351 0.8967604
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.8826068 0.9010121 0.5758734 0.7084504
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.7 0.5650558 0.8862421
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.8717239 0.8996488 0.5054039
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.005827668 0.04507744
# User specified
# easier to exclude features
#model_id_pfx <- "";
# indep_vars_vctr <- setdiff(names(glb_fitobs_df),
# union(union(glb_rsp_var, glb_exclude_vars_as_features),
# c("<feat1_name>", "<feat2_name>")))
# method <- ""
# easier to include features
#model_id_pfx <- ""; indep_vars_vctr <- c("<feat1_name>", "<feat2_name>"); method <- ""
# User specified bivariate models
# indep_vars_vctr_lst <- list()
# for (feat in setdiff(names(glb_fitobs_df),
# union(glb_rsp_var, glb_exclude_vars_as_features)))
# indep_vars_vctr_lst[[feat]] <- feat
# User specified combinatorial models
# indep_vars_vctr_lst <- list()
# combn_mtrx <- combn(c("<feat1_name>", "<feat2_name>", "<featn_name>"),
# <num_feats_to_choose>)
# for (combn_ix in 1:ncol(combn_mtrx))
# #print(combn_mtrx[, combn_ix])
# indep_vars_vctr_lst[[combn_ix]] <- combn_mtrx[, combn_ix]
# template for myfit_mdl
# rf is hard-coded in caret to recognize only Accuracy / Kappa evaluation metrics
# only for OOB in trainControl ?
# ret_lst <- myfit_mdl_fn(model_id=paste0(model_id_pfx, ""), model_method=method,
# indep_vars_vctr=indep_vars_vctr,
# rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
# fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df,
# n_cv_folds=glb_n_cv_folds, tune_models_df=glb_tune_models_df,
# model_loss_mtrx=glb_model_metric_terms,
# model_summaryFunction=glb_model_metric_smmry,
# model_metric=glb_model_metric,
# model_metric_maximize=glb_model_metric_maximize)
# Simplify a model
# fit_df <- glb_fitobs_df; glb_mdl <- step(<complex>_mdl)
# Non-caret models
# rpart_area_mdl <- rpart(reformulate("Area", response=glb_rsp_var),
# data=glb_fitobs_df, #method="class",
# control=rpart.control(cp=0.12),
# parms=list(loss=glb_model_metric_terms))
# print("rpart_sel_wlm_mdl"); prp(rpart_sel_wlm_mdl)
#
# Display the per-model summary table accumulated in glb_models_df
print(x = glb_models_df)
## model_id model_method
## 1 MFO.myMFO_classfr myMFO_classfr
## 2 Random.myrandom_classfr myrandom_classfr
## 3 Max.cor.Y.cv.0.rpart rpart
## 4 Max.cor.Y.cv.0.cp.0.rpart rpart
## 5 Max.cor.Y.rpart rpart
## 6 Max.cor.Y.glm glm
## 7 Interact.High.cor.Y.glm glm
## 8 Low.cor.X.glm glm
## 9 All.X.glm glm
## 10 All.X.no.rnorm.rpart rpart
## feats
## 1 .rnorm
## 2 .rnorm
## 3 A.nuppr.log
## 4 A.nuppr.log
## 5 A.nuppr.log
## 6 A.nuppr.log
## 7 A.nuppr.log, A.nuppr.log:A.nstopwrds.log, A.nuppr.log:A.sum.TfIdf, A.nuppr.log:S.ratio.nstopwrds.nwrds, A.nuppr.log:A.npnct19.log, A.nuppr.log:S.T.make, A.nuppr.log:H.npnct16.log, A.nuppr.log:S.npnct01.log, A.nuppr.log:S.T.can, A.nuppr.log:A.npnct21.log, A.nuppr.log:S.T.said, A.nuppr.log:A.npnct23.log, A.nuppr.log:S.T.one, A.nuppr.log:S.npnct07.log, A.nuppr.log:A.npnct18.log, A.nuppr.log:S.npnct03.log, A.nuppr.log:A.P.http, A.nuppr.log:A.npnct02.log, A.nuppr.log:S.P.year.colon, A.nuppr.log:S.T.obama, A.nuppr.log:S.npnct20.log, A.nuppr.log:S.P.first.draft, A.nuppr.log:S.T.take, A.nuppr.log:S.npnct06.log, A.nuppr.log:A.npnct17.log, A.nuppr.log:S.T.time, A.nuppr.log:S.T.new, A.nuppr.log:S.P.metropolitan.diary.colon, A.nuppr.log:H.T.polit, A.nuppr.log:A.T.year, A.nuppr.log:S.npnct12.log, A.nuppr.log:H.T.read, A.nuppr.log:A.T.will, A.nuppr.log:H.T.word, A.nuppr.log:A.T.senat, A.nuppr.log:S.T.show, A.nuppr.log:S.T.day, A.nuppr.log:S.npnct28.log, A.nuppr.log:H.T.clip, A.nuppr.log:A.T.first, A.nuppr.log:H.P.first.draft, A.nuppr.log:A.T.newyork, A.nuppr.log:A.T.report, A.nuppr.log:A.T.compani, A.nuppr.log:A.T.word, A.nuppr.log:A.npnct28.log, A.nuppr.log:A.T.newyorktim, A.nuppr.log:S.T.share, A.nuppr.log:H.T.billion, A.nuppr.log:A.npnct13.log, A.nuppr.log:A.T.articl, A.nuppr.log:H.P.today.in.politic, A.nuppr.log:H.T.springsumm, A.nuppr.log:S.T.diari, A.nuppr.log:S.npnct04.log, A.nuppr.log:H.T.report, A.nuppr.log:A.T.diari, A.nuppr.log:S.T.herald, A.nuppr.log:A.T.photo, A.nuppr.log:S.npnct15.log, A.nuppr.log:A.T.intern, A.nuppr.log:A.T.herald, A.nuppr.log:S.P.fashion.week, A.nuppr.log:S.T.intern, A.nuppr.log:A.T.archiv, A.nuppr.log:H.P.fashion.week, A.nuppr.log:H.T.X2015, A.nuppr.log:A.T.week, A.nuppr.log:S.npnct11.log, A.nuppr.log:S.nstopwrds.log, A.nuppr.log:S.ndgts.log, A.nuppr.log:H.nwrds.log, A.nuppr.log:S.nwrds.log, A.nuppr.log:H.nuppr.log, A.nuppr.log:A.nwrds.log, A.nuppr.log:S.nchrs.log, A.nuppr.log:S.nuppr.log
## 8 WordCount.log, S.ratio.sum.TfIdf.nwrds, H.ratio.sum.TfIdf.nwrds, .clusterid.fctr, H.sum.TfIdf, A.sum.TfIdf, PubDate.hour.fctr, H.npnct19.log, S.ratio.nstopwrds.nwrds, PubDate.wkend, H.P.recap.colon, H.P.quandary, H.P.no.comment.colon, A.npnct19.log, H.P.facts.figures, H.npnct08.log, PubDate.last10.log, PubDate.last1.log, H.P.readers.respond, S.T.make, H.ratio.nstopwrds.nwrds, H.T.get, S.npnct01.log, H.npnct16.log, S.T.can, H.T.ebola, H.npnct01.log, S.T.said, H.T.make, H.npnct11.log, myCategory.fctr, S.T.one, H.P.s.notebook, H.T.take, A.npnct16.log, S.npnct16.log, A.T.presid, S.T.presid, S.npnct08.log, A.npnct08.log, PubDate.last100.log, .rnorm, H.npnct05.log, H.P.friday.night.music, H.T.say, H.T.obama, H.T.bank, PubDate.date.fctr, PubDate.second.fctr, H.npnct07.log, S.npnct07.log, S.npnct03.log, A.npnct18.log, H.npnct12.log, H.T.word, H.T.big, S.P.year.colon, S.T.obama, S.npnct20.log, H.npnct02.log, H.T.test, S.npnct14.log, H.P.on.this.day, S.P.first.draft, S.T.take, S.npnct06.log, S.T.time, H.T.newyorktim, H.npnct13.log, H.T.deal, S.T.new, H.T.billion, S.P.metropolitan.diary.colon, H.T.polit, H.P.verbatim.colon, H.T.china, H.T.art, PubDate.minute.fctr, H.T.read, S.npnct12.log, A.T.year, A.T.will, S.T.appear, PubDate.wkday.fctr, H.T.pictur, H.T.new, A.T.senat, S.T.show, H.P.today.in.smallbusiness, S.T.day, H.P.first.draft, S.npnct28.log, H.P.daily.clip.report, H.T.clip, S.P.daily.clip.report, A.T.first, H.T.news, H.T.X2014, A.T.newyork, A.T.report, A.T.compani, A.T.word, H.T.busi, A.T.newyorktim, A.npnct13.log, S.T.share, A.T.articl, H.T.newyork, H.T.springsumm, H.T.day, S.T.diari, H.T.report, S.npnct04.log, S.T.herald, S.npnct15.log, H.T.week, A.T.photo, A.T.intern, S.T.tribun, S.P.fashion.week, S.T.archiv, H.P.fashion.week, H.npnct15.log, A.T.fashion, A.T.week, H.nstopwrds.log, H.npnct28.log, S.npnct11.log, S.nstopwrds.log, H.ndgts.log, S.ndgts.log, H.nuppr.log, H.nwrds.log, S.nwrds.log, A.nchrs.log, A.nwrds.unq.log, S.nuppr.log
## 9 WordCount.log, A.ratio.sum.TfIdf.nwrds, S.ratio.sum.TfIdf.nwrds, H.ratio.sum.TfIdf.nwrds, .clusterid.fctr, H.sum.TfIdf, S.sum.TfIdf, A.sum.TfIdf, PubDate.hour.fctr, H.npnct19.log, A.ratio.nstopwrds.nwrds, S.ratio.nstopwrds.nwrds, PubDate.wkend, H.P.recap.colon, H.P.quandary, H.P.no.comment.colon, S.npnct19.log, H.P.facts.figures, H.npnct08.log, PubDate.last10.log, PubDate.last1.log, H.P.readers.respond, A.T.make, S.T.make, H.ratio.nstopwrds.nwrds, H.T.get, H.npnct06.log, S.npnct01.log, A.T.can, H.npnct16.log, S.T.can, H.T.ebola, H.npnct01.log, A.T.said, S.T.said, H.T.make, H.npnct11.log, myCategory.fctr, A.T.one, S.T.one, H.P.s.notebook, H.T.take, S.npnct16.log, A.T.presid, S.T.presid, S.npnct08.log, PubDate.last100.log, .rnorm, H.npnct05.log, H.P.friday.night.music, H.T.say, H.T.obama, H.T.bank, PubDate.date.fctr, PubDate.second.fctr, H.npnct07.log, S.npnct07.log, S.npnct03.log, A.npnct18.log, H.npnct12.log, H.T.word, H.T.big, A.npnct02.log, A.npnct17.log, S.P.year.colon, S.T.obama, A.T.obama, S.npnct20.log, H.npnct02.log, H.T.test, H.P.on.this.day, S.P.first.draft, S.T.take, A.T.take, S.npnct06.log, A.npnct14.log, S.T.time, A.T.time, H.T.newyorktim, H.npnct13.log, H.T.deal, S.T.new, A.T.new, H.T.billion, S.P.metropolitan.diary.colon, H.T.polit, H.P.verbatim.colon, H.T.china, H.T.art, PubDate.minute.fctr, H.T.read, S.npnct12.log, H.P.today.in.politic, A.T.year, S.T.year, H.P.what.we.are, A.T.will, S.T.will, A.T.appear, S.T.appear, PubDate.wkday.fctr, H.T.pictur, H.T.new, A.T.senat, S.T.senat, S.T.show, A.T.show, H.P.today.in.smallbusiness, S.T.day, A.T.day, S.npnct28.log, A.npnct28.log, H.P.daily.clip.report, H.T.clip, A.T.first, H.T.news, S.T.first, H.T.first, H.T.X2014, A.T.newyork, S.T.newyork, A.T.report, A.T.compani, S.T.report, S.T.compani, A.T.word, S.T.word, H.T.morn, H.T.busi, A.T.newyorktim, S.T.newyorktim, A.npnct13.log, A.T.share, S.T.share, H.npnct04.log, S.npnct13.log, A.T.articl, S.T.articl, H.T.newyork, H.T.today, H.T.springsumm, H.T.day, 
H.npnct14.log, A.T.diari, S.T.diari, H.T.report, S.npnct04.log, H.T.daili, H.T.X2015, A.T.herald, S.T.herald, S.npnct15.log, H.T.week, A.T.photo, S.T.photo, A.T.intern, S.T.intern, A.T.tribun, S.T.tribun, S.P.fashion.week, A.T.archiv, S.T.archiv, H.P.fashion.week, H.P.year.colon, H.T.fashion, H.npnct15.log, A.T.fashion, S.T.fashion, A.T.week, S.T.week, H.nstopwrds.log, H.npnct28.log, S.npnct11.log, S.nstopwrds.log, A.nstopwrds.log, H.ndgts.log, S.ndgts.log, H.nuppr.log, H.nwrds.log, H.nchrs.log, S.nwrds.log, A.nwrds.log, H.nwrds.unq.log, S.nchrs.log, A.nwrds.unq.log, S.nwrds.unq.log, S.nuppr.log
## 10 WordCount.log, A.ratio.sum.TfIdf.nwrds, S.ratio.sum.TfIdf.nwrds, H.ratio.sum.TfIdf.nwrds, .clusterid.fctr, H.sum.TfIdf, S.sum.TfIdf, A.sum.TfIdf, PubDate.hour.fctr, H.npnct19.log, A.ratio.nstopwrds.nwrds, S.ratio.nstopwrds.nwrds, PubDate.wkend, H.P.recap.colon, H.P.quandary, H.P.no.comment.colon, S.npnct19.log, H.P.facts.figures, H.npnct08.log, PubDate.last10.log, PubDate.last1.log, H.P.readers.respond, A.T.make, S.T.make, H.ratio.nstopwrds.nwrds, H.T.get, H.npnct06.log, S.npnct01.log, A.T.can, H.npnct16.log, S.T.can, H.T.ebola, H.npnct01.log, A.T.said, S.T.said, H.T.make, H.npnct11.log, myCategory.fctr, A.T.one, S.T.one, H.P.s.notebook, H.T.take, S.npnct16.log, A.T.presid, S.T.presid, S.npnct08.log, PubDate.last100.log, H.npnct05.log, H.P.friday.night.music, H.T.say, H.T.obama, H.T.bank, PubDate.date.fctr, PubDate.second.fctr, H.npnct07.log, S.npnct07.log, S.npnct03.log, A.npnct18.log, H.npnct12.log, H.T.word, H.T.big, A.npnct02.log, A.npnct17.log, S.P.year.colon, S.T.obama, A.T.obama, S.npnct20.log, H.npnct02.log, H.T.test, H.P.on.this.day, S.P.first.draft, S.T.take, A.T.take, S.npnct06.log, A.npnct14.log, S.T.time, A.T.time, H.T.newyorktim, H.npnct13.log, H.T.deal, S.T.new, A.T.new, H.T.billion, S.P.metropolitan.diary.colon, H.T.polit, H.P.verbatim.colon, H.T.china, H.T.art, PubDate.minute.fctr, H.T.read, S.npnct12.log, H.P.today.in.politic, A.T.year, S.T.year, H.P.what.we.are, A.T.will, S.T.will, A.T.appear, S.T.appear, PubDate.wkday.fctr, H.T.pictur, H.T.new, A.T.senat, S.T.senat, S.T.show, A.T.show, H.P.today.in.smallbusiness, S.T.day, A.T.day, S.npnct28.log, A.npnct28.log, H.P.daily.clip.report, H.T.clip, A.T.first, H.T.news, S.T.first, H.T.first, H.T.X2014, A.T.newyork, S.T.newyork, A.T.report, A.T.compani, S.T.report, S.T.compani, A.T.word, S.T.word, H.T.morn, H.T.busi, A.T.newyorktim, S.T.newyorktim, A.npnct13.log, A.T.share, S.T.share, H.npnct04.log, S.npnct13.log, A.T.articl, S.T.articl, H.T.newyork, H.T.today, H.T.springsumm, H.T.day, 
H.npnct14.log, A.T.diari, S.T.diari, H.T.report, S.npnct04.log, H.T.daili, H.T.X2015, A.T.herald, S.T.herald, S.npnct15.log, H.T.week, A.T.photo, S.T.photo, A.T.intern, S.T.intern, A.T.tribun, S.T.tribun, S.P.fashion.week, A.T.archiv, S.T.archiv, H.P.fashion.week, H.P.year.colon, H.T.fashion, H.npnct15.log, A.T.fashion, S.T.fashion, A.T.week, S.T.week, H.nstopwrds.log, H.npnct28.log, S.npnct11.log, S.nstopwrds.log, A.nstopwrds.log, H.ndgts.log, S.ndgts.log, H.nuppr.log, H.nwrds.log, H.nchrs.log, S.nwrds.log, A.nwrds.log, H.nwrds.unq.log, S.nchrs.log, A.nwrds.unq.log, S.nwrds.unq.log, S.nuppr.log
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 0 0.765 0.003
## 2 0 0.349 0.002
## 3 0 0.686 0.057
## 4 0 0.607 0.056
## 5 1 1.207 0.057
## 6 1 1.228 0.080
## 7 1 4.292 1.812
## 8 1 19.897 9.476
## 9 1 28.039 14.339
## 10 3 17.700 3.558
## max.auc.fit opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.5000000 0.5 0.0000000 0.8326257
## 2 0.5072166 0.1 0.2867534 0.1673743
## 3 0.5000000 0.5 0.0000000 0.8326257
## 4 0.5000000 0.5 0.0000000 0.8326257
## 5 0.5000000 0.5 0.0000000 0.8326258
## 6 0.7073742 0.2 0.3986014 0.8324022
## 7 0.8158344 0.3 0.4834636 0.8480454
## 8 0.9622452 0.4 0.7758389 0.8905043
## 9 0.9641175 0.4 0.7799331 0.8905030
## 10 0.7277461 0.7 0.5978351 0.8967604
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.8213602 0.8434553 0.0000000000 0.5000000
## 2 0.1565447 0.1786398 0.0000000000 0.4877001
## 3 0.8213602 0.8434553 0.0000000000 0.5000000
## 4 0.8213602 0.8434553 0.0000000000 0.5000000
## 5 0.8213602 0.8434553 0.0000000000 0.5000000
## 6 0.7176970 0.7439004 -0.0004459345 0.7102060
## 7 0.7961944 0.8194916 0.2423448621 0.7911694
## 8 0.9172705 0.9328952 0.5860953239 0.9229354
## 9 0.9184402 0.9339586 0.5963535508 0.9159268
## 10 0.8826068 0.9010121 0.5758734208 0.7084504
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.5 0.0000000 0.8327662
## 2 0.1 0.2865473 0.1672338
## 3 0.5 0.0000000 0.8327662
## 4 0.5 0.0000000 0.8327662
## 5 0.5 0.0000000 0.8327662
## 6 0.2 0.3880266 0.7316480
## 7 0.3 0.4651774 0.8021390
## 8 0.3 0.7438017 0.9095771
## 9 0.3 0.7173913 0.8988819
## 10 0.7 0.5650558 0.8862421
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.8159247 0.8486533 0.0000000
## 2 0.1513467 0.1840753 0.0000000
## 3 0.8159247 0.8486533 0.0000000
## 4 0.8159247 0.8486533 0.0000000
## 5 0.8159247 0.8486533 0.0000000
## 6 0.7119353 0.7506985 0.2283681
## 7 0.7842544 0.8191582 0.3451612
## 8 0.8963496 0.9216231 0.6890845
## 9 0.8850365 0.9115781 0.6561351
## 10 0.8717239 0.8996488 0.5054039
## max.AccuracySD.fit max.KappaSD.fit min.aic.fit
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 0.0002791548 0.0000000000 NA
## 6 0.0000648833 0.0007723812 3714.601
## 7 0.0072076929 0.0311783170 3300.299
## 8 0.0219372388 0.0890361630 2088.981
## 9 0.0087438210 0.0323972025 2140.884
## 10 0.0058276684 0.0450774444 NA
# Discard the last model-fit result, then record the closing
# "fit.models_1_end" step in the fit.models_1 chunk log.
rm(list = "ret_lst")
fit.models_1_chunk_df <- myadd_chunk(
  fit.models_1_chunk_df,
  "fit.models_1_end",
  major.inc = TRUE
)
## label step_major step_minor bgn end elapsed
## 3 fit.models_1_rpart 3 0 466.394 489.062 22.668
## 4 fit.models_1_end 4 0 489.063 NA NA
# Record the next "fit.models" step in the global chunk log
# (minor increment: major.inc = FALSE).
glb_chunks_df <- myadd_chunk(
  glb_chunks_df,
  "fit.models",
  major.inc = FALSE
)
## label step_major step_minor bgn end elapsed
## 11 fit.models 7 1 425.869 489.069 63.2
## 12 fit.models 7 2 489.070 NA NA
# When glb_model_metric_smmry is set, re-score every model on the fit and OOB
# observations via mypredict_mdl(ret_type = "stats") and splice the columns
# whose names match glb_model_metric back into glb_models_df.
if (!is.null(glb_model_metric_smmry)) {
  # One-column data frame of model ids; drop = FALSE keeps it a data frame.
  stats_df <- glb_models_df[, "model_id", drop = FALSE]

  # Stats on the fit observations, one mypredict_mdl() call per model.
  stats_mdl_df <- data.frame()
  for (model_id in stats_df$model_id) {
    stats_mdl_df <- rbind(
      stats_mdl_df,
      mypredict_mdl(glb_models_lst[[model_id]], glb_fitobs_df, glb_rsp_var,
                    glb_rsp_var_out, model_id, "fit",
                    glb_model_metric_smmry, glb_model_metric,
                    glb_model_metric_maximize, ret_type = "stats")
    )
  }
  stats_df <- merge(stats_df, stats_mdl_df, all.x = TRUE)

  # Same again for the OOB observations.
  stats_mdl_df <- data.frame()
  for (model_id in stats_df$model_id) {
    stats_mdl_df <- rbind(
      stats_mdl_df,
      mypredict_mdl(glb_models_lst[[model_id]], glb_OOBobs_df, glb_rsp_var,
                    glb_rsp_var_out, model_id, "OOB",
                    glb_model_metric_smmry, glb_model_metric,
                    glb_model_metric_maximize, ret_type = "stats")
    )
  }
  stats_df <- merge(stats_df, stats_mdl_df, all.x = TRUE)

  #     tmp_models_df <- orderBy(~model_id, glb_models_df)
  #     rownames(tmp_models_df) <- seq(1, nrow(tmp_models_df))
  #     all.equal(subset(tmp_models_df[, names(stats_df)], model_id != "Random.myrandom_classfr"),
  #               subset(stats_df, model_id != "Random.myrandom_classfr"))
  #     print(subset(tmp_models_df[, names(stats_df)], model_id != "Random.myrandom_classfr")[, c("model_id", "max.Accuracy.fit")])
  #     print(subset(stats_df, model_id != "Random.myrandom_classfr")[, c("model_id", "max.Accuracy.fit")])

  print("Merging following data into glb_models_df:")

  # First column of stats_df plus every column whose name matches the metric.
  stats_mrg_df <- stats_df[, c(1, grep(glb_model_metric, names(stats_df)))]
  print(stats_mrg_df)

  # Current values of those same metric columns, ordered by model_id, for
  # side-by-side comparison in the log.
  tmp_models_df <- orderBy(
    ~model_id,
    glb_models_df[, c("model_id",
                      grep(glb_model_metric, names(stats_df), value = TRUE))]
  )
  print(tmp_models_df)

  # Everything except the metric columns. "model_id" is requested explicitly
  # in addition to whatever setdiff() keeps; a repeated selection is what
  # yields the model_id.1 column removed by the final subset().
  tmp2_models_df <- glb_models_df[, c("model_id",
                                      setdiff(names(glb_models_df),
                                              grep(glb_model_metric,
                                                   names(stats_df),
                                                   value = TRUE)))]
  tmp3_models_df <- merge(tmp2_models_df, stats_mrg_df,
                          all.x = TRUE, sort = FALSE)
  print(tmp3_models_df)
  print(names(tmp3_models_df))

  glb_models_df <- subset(tmp3_models_df, select = -model_id.1)
  print(glb_models_df)
}
# Build the plotting copy of the model summary table:
#   1. drop every column whose name contains "SD", "Upper" or "Lower";
#   2. replace each "min.<x>" column by its reciprocal, named "inv.<x>".
#
# Columns are selected with logical masks / exact-name comparison instead of
# negative grep() indices: -grep() on zero matches is integer(0), which
# selects NO columns rather than all of them, and grep(var, ...) would treat
# the column name as a regular expression. drop = FALSE keeps the result a
# data frame even if a single column remains.
plt_models_df <- glb_models_df[
  , !grepl("SD|Upper|Lower", names(glb_models_df)), drop = FALSE
]

# "^min\\." anchors the prefix and escapes the dot, so only names that
# literally start with "min." are converted.
for (var in grep("^min\\.", names(plt_models_df), value = TRUE)) {
  # The reciprocal is appended as a new column; NA entries stay NA.
  plt_models_df[, sub("^min\\.", "inv.", var)] <- 1.0 / plt_models_df[, var]
  # Remove exactly the source column, by name.
  plt_models_df <- plt_models_df[, names(plt_models_df) != var, drop = FALSE]
}
print(plt_models_df)
## model_id model_method
## 1 MFO.myMFO_classfr myMFO_classfr
## 2 Random.myrandom_classfr myrandom_classfr
## 3 Max.cor.Y.cv.0.rpart rpart
## 4 Max.cor.Y.cv.0.cp.0.rpart rpart
## 5 Max.cor.Y.rpart rpart
## 6 Max.cor.Y.glm glm
## 7 Interact.High.cor.Y.glm glm
## 8 Low.cor.X.glm glm
## 9 All.X.glm glm
## 10 All.X.no.rnorm.rpart rpart
## feats
## 1 .rnorm
## 2 .rnorm
## 3 A.nuppr.log
## 4 A.nuppr.log
## 5 A.nuppr.log
## 6 A.nuppr.log
## 7 A.nuppr.log, A.nuppr.log:A.nstopwrds.log, A.nuppr.log:A.sum.TfIdf, A.nuppr.log:S.ratio.nstopwrds.nwrds, A.nuppr.log:A.npnct19.log, A.nuppr.log:S.T.make, A.nuppr.log:H.npnct16.log, A.nuppr.log:S.npnct01.log, A.nuppr.log:S.T.can, A.nuppr.log:A.npnct21.log, A.nuppr.log:S.T.said, A.nuppr.log:A.npnct23.log, A.nuppr.log:S.T.one, A.nuppr.log:S.npnct07.log, A.nuppr.log:A.npnct18.log, A.nuppr.log:S.npnct03.log, A.nuppr.log:A.P.http, A.nuppr.log:A.npnct02.log, A.nuppr.log:S.P.year.colon, A.nuppr.log:S.T.obama, A.nuppr.log:S.npnct20.log, A.nuppr.log:S.P.first.draft, A.nuppr.log:S.T.take, A.nuppr.log:S.npnct06.log, A.nuppr.log:A.npnct17.log, A.nuppr.log:S.T.time, A.nuppr.log:S.T.new, A.nuppr.log:S.P.metropolitan.diary.colon, A.nuppr.log:H.T.polit, A.nuppr.log:A.T.year, A.nuppr.log:S.npnct12.log, A.nuppr.log:H.T.read, A.nuppr.log:A.T.will, A.nuppr.log:H.T.word, A.nuppr.log:A.T.senat, A.nuppr.log:S.T.show, A.nuppr.log:S.T.day, A.nuppr.log:S.npnct28.log, A.nuppr.log:H.T.clip, A.nuppr.log:A.T.first, A.nuppr.log:H.P.first.draft, A.nuppr.log:A.T.newyork, A.nuppr.log:A.T.report, A.nuppr.log:A.T.compani, A.nuppr.log:A.T.word, A.nuppr.log:A.npnct28.log, A.nuppr.log:A.T.newyorktim, A.nuppr.log:S.T.share, A.nuppr.log:H.T.billion, A.nuppr.log:A.npnct13.log, A.nuppr.log:A.T.articl, A.nuppr.log:H.P.today.in.politic, A.nuppr.log:H.T.springsumm, A.nuppr.log:S.T.diari, A.nuppr.log:S.npnct04.log, A.nuppr.log:H.T.report, A.nuppr.log:A.T.diari, A.nuppr.log:S.T.herald, A.nuppr.log:A.T.photo, A.nuppr.log:S.npnct15.log, A.nuppr.log:A.T.intern, A.nuppr.log:A.T.herald, A.nuppr.log:S.P.fashion.week, A.nuppr.log:S.T.intern, A.nuppr.log:A.T.archiv, A.nuppr.log:H.P.fashion.week, A.nuppr.log:H.T.X2015, A.nuppr.log:A.T.week, A.nuppr.log:S.npnct11.log, A.nuppr.log:S.nstopwrds.log, A.nuppr.log:S.ndgts.log, A.nuppr.log:H.nwrds.log, A.nuppr.log:S.nwrds.log, A.nuppr.log:H.nuppr.log, A.nuppr.log:A.nwrds.log, A.nuppr.log:S.nchrs.log, A.nuppr.log:S.nuppr.log
## 8 WordCount.log, S.ratio.sum.TfIdf.nwrds, H.ratio.sum.TfIdf.nwrds, .clusterid.fctr, H.sum.TfIdf, A.sum.TfIdf, PubDate.hour.fctr, H.npnct19.log, S.ratio.nstopwrds.nwrds, PubDate.wkend, H.P.recap.colon, H.P.quandary, H.P.no.comment.colon, A.npnct19.log, H.P.facts.figures, H.npnct08.log, PubDate.last10.log, PubDate.last1.log, H.P.readers.respond, S.T.make, H.ratio.nstopwrds.nwrds, H.T.get, S.npnct01.log, H.npnct16.log, S.T.can, H.T.ebola, H.npnct01.log, S.T.said, H.T.make, H.npnct11.log, myCategory.fctr, S.T.one, H.P.s.notebook, H.T.take, A.npnct16.log, S.npnct16.log, A.T.presid, S.T.presid, S.npnct08.log, A.npnct08.log, PubDate.last100.log, .rnorm, H.npnct05.log, H.P.friday.night.music, H.T.say, H.T.obama, H.T.bank, PubDate.date.fctr, PubDate.second.fctr, H.npnct07.log, S.npnct07.log, S.npnct03.log, A.npnct18.log, H.npnct12.log, H.T.word, H.T.big, S.P.year.colon, S.T.obama, S.npnct20.log, H.npnct02.log, H.T.test, S.npnct14.log, H.P.on.this.day, S.P.first.draft, S.T.take, S.npnct06.log, S.T.time, H.T.newyorktim, H.npnct13.log, H.T.deal, S.T.new, H.T.billion, S.P.metropolitan.diary.colon, H.T.polit, H.P.verbatim.colon, H.T.china, H.T.art, PubDate.minute.fctr, H.T.read, S.npnct12.log, A.T.year, A.T.will, S.T.appear, PubDate.wkday.fctr, H.T.pictur, H.T.new, A.T.senat, S.T.show, H.P.today.in.smallbusiness, S.T.day, H.P.first.draft, S.npnct28.log, H.P.daily.clip.report, H.T.clip, S.P.daily.clip.report, A.T.first, H.T.news, H.T.X2014, A.T.newyork, A.T.report, A.T.compani, A.T.word, H.T.busi, A.T.newyorktim, A.npnct13.log, S.T.share, A.T.articl, H.T.newyork, H.T.springsumm, H.T.day, S.T.diari, H.T.report, S.npnct04.log, S.T.herald, S.npnct15.log, H.T.week, A.T.photo, A.T.intern, S.T.tribun, S.P.fashion.week, S.T.archiv, H.P.fashion.week, H.npnct15.log, A.T.fashion, A.T.week, H.nstopwrds.log, H.npnct28.log, S.npnct11.log, S.nstopwrds.log, H.ndgts.log, S.ndgts.log, H.nuppr.log, H.nwrds.log, S.nwrds.log, A.nchrs.log, A.nwrds.unq.log, S.nuppr.log
## 9 WordCount.log, A.ratio.sum.TfIdf.nwrds, S.ratio.sum.TfIdf.nwrds, H.ratio.sum.TfIdf.nwrds, .clusterid.fctr, H.sum.TfIdf, S.sum.TfIdf, A.sum.TfIdf, PubDate.hour.fctr, H.npnct19.log, A.ratio.nstopwrds.nwrds, S.ratio.nstopwrds.nwrds, PubDate.wkend, H.P.recap.colon, H.P.quandary, H.P.no.comment.colon, S.npnct19.log, H.P.facts.figures, H.npnct08.log, PubDate.last10.log, PubDate.last1.log, H.P.readers.respond, A.T.make, S.T.make, H.ratio.nstopwrds.nwrds, H.T.get, H.npnct06.log, S.npnct01.log, A.T.can, H.npnct16.log, S.T.can, H.T.ebola, H.npnct01.log, A.T.said, S.T.said, H.T.make, H.npnct11.log, myCategory.fctr, A.T.one, S.T.one, H.P.s.notebook, H.T.take, S.npnct16.log, A.T.presid, S.T.presid, S.npnct08.log, PubDate.last100.log, .rnorm, H.npnct05.log, H.P.friday.night.music, H.T.say, H.T.obama, H.T.bank, PubDate.date.fctr, PubDate.second.fctr, H.npnct07.log, S.npnct07.log, S.npnct03.log, A.npnct18.log, H.npnct12.log, H.T.word, H.T.big, A.npnct02.log, A.npnct17.log, S.P.year.colon, S.T.obama, A.T.obama, S.npnct20.log, H.npnct02.log, H.T.test, H.P.on.this.day, S.P.first.draft, S.T.take, A.T.take, S.npnct06.log, A.npnct14.log, S.T.time, A.T.time, H.T.newyorktim, H.npnct13.log, H.T.deal, S.T.new, A.T.new, H.T.billion, S.P.metropolitan.diary.colon, H.T.polit, H.P.verbatim.colon, H.T.china, H.T.art, PubDate.minute.fctr, H.T.read, S.npnct12.log, H.P.today.in.politic, A.T.year, S.T.year, H.P.what.we.are, A.T.will, S.T.will, A.T.appear, S.T.appear, PubDate.wkday.fctr, H.T.pictur, H.T.new, A.T.senat, S.T.senat, S.T.show, A.T.show, H.P.today.in.smallbusiness, S.T.day, A.T.day, S.npnct28.log, A.npnct28.log, H.P.daily.clip.report, H.T.clip, A.T.first, H.T.news, S.T.first, H.T.first, H.T.X2014, A.T.newyork, S.T.newyork, A.T.report, A.T.compani, S.T.report, S.T.compani, A.T.word, S.T.word, H.T.morn, H.T.busi, A.T.newyorktim, S.T.newyorktim, A.npnct13.log, A.T.share, S.T.share, H.npnct04.log, S.npnct13.log, A.T.articl, S.T.articl, H.T.newyork, H.T.today, H.T.springsumm, H.T.day, 
H.npnct14.log, A.T.diari, S.T.diari, H.T.report, S.npnct04.log, H.T.daili, H.T.X2015, A.T.herald, S.T.herald, S.npnct15.log, H.T.week, A.T.photo, S.T.photo, A.T.intern, S.T.intern, A.T.tribun, S.T.tribun, S.P.fashion.week, A.T.archiv, S.T.archiv, H.P.fashion.week, H.P.year.colon, H.T.fashion, H.npnct15.log, A.T.fashion, S.T.fashion, A.T.week, S.T.week, H.nstopwrds.log, H.npnct28.log, S.npnct11.log, S.nstopwrds.log, A.nstopwrds.log, H.ndgts.log, S.ndgts.log, H.nuppr.log, H.nwrds.log, H.nchrs.log, S.nwrds.log, A.nwrds.log, H.nwrds.unq.log, S.nchrs.log, A.nwrds.unq.log, S.nwrds.unq.log, S.nuppr.log
## 10 WordCount.log, A.ratio.sum.TfIdf.nwrds, S.ratio.sum.TfIdf.nwrds, H.ratio.sum.TfIdf.nwrds, .clusterid.fctr, H.sum.TfIdf, S.sum.TfIdf, A.sum.TfIdf, PubDate.hour.fctr, H.npnct19.log, A.ratio.nstopwrds.nwrds, S.ratio.nstopwrds.nwrds, PubDate.wkend, H.P.recap.colon, H.P.quandary, H.P.no.comment.colon, S.npnct19.log, H.P.facts.figures, H.npnct08.log, PubDate.last10.log, PubDate.last1.log, H.P.readers.respond, A.T.make, S.T.make, H.ratio.nstopwrds.nwrds, H.T.get, H.npnct06.log, S.npnct01.log, A.T.can, H.npnct16.log, S.T.can, H.T.ebola, H.npnct01.log, A.T.said, S.T.said, H.T.make, H.npnct11.log, myCategory.fctr, A.T.one, S.T.one, H.P.s.notebook, H.T.take, S.npnct16.log, A.T.presid, S.T.presid, S.npnct08.log, PubDate.last100.log, H.npnct05.log, H.P.friday.night.music, H.T.say, H.T.obama, H.T.bank, PubDate.date.fctr, PubDate.second.fctr, H.npnct07.log, S.npnct07.log, S.npnct03.log, A.npnct18.log, H.npnct12.log, H.T.word, H.T.big, A.npnct02.log, A.npnct17.log, S.P.year.colon, S.T.obama, A.T.obama, S.npnct20.log, H.npnct02.log, H.T.test, H.P.on.this.day, S.P.first.draft, S.T.take, A.T.take, S.npnct06.log, A.npnct14.log, S.T.time, A.T.time, H.T.newyorktim, H.npnct13.log, H.T.deal, S.T.new, A.T.new, H.T.billion, S.P.metropolitan.diary.colon, H.T.polit, H.P.verbatim.colon, H.T.china, H.T.art, PubDate.minute.fctr, H.T.read, S.npnct12.log, H.P.today.in.politic, A.T.year, S.T.year, H.P.what.we.are, A.T.will, S.T.will, A.T.appear, S.T.appear, PubDate.wkday.fctr, H.T.pictur, H.T.new, A.T.senat, S.T.senat, S.T.show, A.T.show, H.P.today.in.smallbusiness, S.T.day, A.T.day, S.npnct28.log, A.npnct28.log, H.P.daily.clip.report, H.T.clip, A.T.first, H.T.news, S.T.first, H.T.first, H.T.X2014, A.T.newyork, S.T.newyork, A.T.report, A.T.compani, S.T.report, S.T.compani, A.T.word, S.T.word, H.T.morn, H.T.busi, A.T.newyorktim, S.T.newyorktim, A.npnct13.log, A.T.share, S.T.share, H.npnct04.log, S.npnct13.log, A.T.articl, S.T.articl, H.T.newyork, H.T.today, H.T.springsumm, H.T.day, 
H.npnct14.log, A.T.diari, S.T.diari, H.T.report, S.npnct04.log, H.T.daili, H.T.X2015, A.T.herald, S.T.herald, S.npnct15.log, H.T.week, A.T.photo, S.T.photo, A.T.intern, S.T.intern, A.T.tribun, S.T.tribun, S.P.fashion.week, A.T.archiv, S.T.archiv, H.P.fashion.week, H.P.year.colon, H.T.fashion, H.npnct15.log, A.T.fashion, S.T.fashion, A.T.week, S.T.week, H.nstopwrds.log, H.npnct28.log, S.npnct11.log, S.nstopwrds.log, A.nstopwrds.log, H.ndgts.log, S.ndgts.log, H.nuppr.log, H.nwrds.log, H.nchrs.log, S.nwrds.log, A.nwrds.log, H.nwrds.unq.log, S.nchrs.log, A.nwrds.unq.log, S.nwrds.unq.log, S.nuppr.log
## max.nTuningRuns max.auc.fit opt.prob.threshold.fit max.f.score.fit
## 1 0 0.5000000 0.5 0.0000000
## 2 0 0.5072166 0.1 0.2867534
## 3 0 0.5000000 0.5 0.0000000
## 4 0 0.5000000 0.5 0.0000000
## 5 1 0.5000000 0.5 0.0000000
## 6 1 0.7073742 0.2 0.3986014
## 7 1 0.8158344 0.3 0.4834636
## 8 1 0.9622452 0.4 0.7758389
## 9 1 0.9641175 0.4 0.7799331
## 10 3 0.7277461 0.7 0.5978351
## max.Accuracy.fit max.Kappa.fit max.auc.OOB opt.prob.threshold.OOB
## 1 0.8326257 0.0000000000 0.5000000 0.5
## 2 0.1673743 0.0000000000 0.4877001 0.1
## 3 0.8326257 0.0000000000 0.5000000 0.5
## 4 0.8326257 0.0000000000 0.5000000 0.5
## 5 0.8326258 0.0000000000 0.5000000 0.5
## 6 0.8324022 -0.0004459345 0.7102060 0.2
## 7 0.8480454 0.2423448621 0.7911694 0.3
## 8 0.8905043 0.5860953239 0.9229354 0.3
## 9 0.8905030 0.5963535508 0.9159268 0.3
## 10 0.8967604 0.5758734208 0.7084504 0.7
## max.f.score.OOB max.Accuracy.OOB max.Kappa.OOB
## 1 0.0000000 0.8327662 0.0000000
## 2 0.2865473 0.1672338 0.0000000
## 3 0.0000000 0.8327662 0.0000000
## 4 0.0000000 0.8327662 0.0000000
## 5 0.0000000 0.8327662 0.0000000
## 6 0.3880266 0.7316480 0.2283681
## 7 0.4651774 0.8021390 0.3451612
## 8 0.7438017 0.9095771 0.6890845
## 9 0.7173913 0.8988819 0.6561351
## 10 0.5650558 0.8862421 0.5054039
## inv.elapsedtime.everything inv.elapsedtime.final inv.aic.fit
## 1 1.30718954 333.33333333 NA
## 2 2.86532951 500.00000001 NA
## 3 1.45772595 17.54385965 NA
## 4 1.64744646 17.85714286 NA
## 5 0.82850041 17.54385965 NA
## 6 0.81433225 12.50000000 0.0002692079
## 7 0.23299161 0.55187638 0.0003030028
## 8 0.05025883 0.10552976 0.0004787023
## 9 0.03566461 0.06973987 0.0004670967
## 10 0.05649718 0.28105677 NA
# Print whatever myplot_radar() returns for plt_models_df.
# NOTE(review): myplot_radar is not defined in this chunk; presumably a
# project plotting helper that builds a radar chart -- confirm.
print(myplot_radar(radar_inp_df=plt_models_df))
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have
## 10. Consider specifying shapes manually. if you must have them.
## Warning in loop_apply(n, do.ply): Removed 5 rows containing missing values
## (geom_path).
## Warning in loop_apply(n, do.ply): Removed 60 rows containing missing values
## (geom_point).
## Warning in loop_apply(n, do.ply): Removed 6 rows containing missing values
## (geom_text).
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have
## 10. Consider specifying shapes manually. if you must have them.
# print(myplot_radar(radar_inp_df=subset(plt_models_df,
# !(model_id %in% grep("random|MFO", plt_models_df$model_id, value=TRUE)))))
# Derive Upper/Lower bound columns for every "<prefix>SD<suffix>" column.
# max.df           : max.nTuningRuns - 1 when more than one tuning run, else NA
# min.sd2ci.scaler : 97.5% quantile of the t distribution with max.df degrees
#                    of freedom (NA when max.df is NA)
glb_models_df <- mutate(glb_models_df,
    max.df = ifelse(max.nTuningRuns > 1, max.nTuningRuns - 1, NA),
    min.sd2ci.scaler = ifelse(is.na(max.df), NA, qt(0.975, max.df)))
for (var in grep("SD", names(glb_models_df), value=TRUE)) {
    # Names of the metric column and of its two bound columns
    var_components <- strsplit(var, "SD")[[1]]
    varActul <- paste(var_components[1:2], collapse="")
    varUpper <- paste0(var_components[1], "Upper", var_components[2])
    varLower <- paste0(var_components[1], "Lower", var_components[2])
    if (!(varUpper %in% names(glb_models_df))) {
        print(sprintf("var:%s", var))
        # bound = metric +/- scaler * SD
        # NOTE(review): the SD is used directly, not SD / sqrt(n) -- confirm
        # that this is the intended interval.
        glb_models_df[, varUpper] <- glb_models_df[, varActul] +
            glb_models_df[, "min.sd2ci.scaler"] * glb_models_df[, var]
        glb_models_df[, varLower] <- glb_models_df[, varActul] -
            glb_models_df[, "min.sd2ci.scaler"] * glb_models_df[, var]
    } else {
        # Upper bound is already there; the Lower bound is assumed to be too
        warning(varUpper, " already exists in glb_models_df")
    }
}
## Warning: max.AccuracyUpper.fit already exists in glb_models_df
## [1] "var:max.KappaSD.fit"
# Collect the inputs for the metrics-with-CI plot:
#   plt_models_df   : model_id + each metric that has an "Upper" companion
#   pltCI_models_df : model_id + the matching Upper / Lower bound columns
plt_models_df <- glb_models_df[, "model_id", drop=FALSE]
pltCI_models_df <- glb_models_df[, "model_id", drop=FALSE]
for (var in grep("Upper", names(glb_models_df), value=TRUE)) {
    # "<prefix>Upper<suffix>" -> metric column "<prefix><suffix>"
    var_components <- strsplit(var, "Upper")[[1]]
    col_name <- paste(var_components, collapse="")
    plt_models_df[, col_name] <- glb_models_df[, col_name]
    for (name in paste0(var_components[1], c("Upper", "Lower"),
                        var_components[2])) {
        pltCI_models_df[, name] <- glb_models_df[, name]
    }
}
# Melt a wide model-stats table into long format and split each metric
# column name into its label and its data-set suffix.
#
# Args:
#   plt_models_df: data frame with a "model_id" column plus metric columns
#                  named "<label>.<data>", e.g. "max.Accuracy.fit".
# Returns:
#   The melt() result (one row per model_id / metric column) with two added
#   character columns:
#     data  - the text after the last "." of the metric name (e.g. "fit")
#     label - the metric name with that trailing ".<data>" removed
#             (e.g. "max.Accuracy")
#   A name without any "." is returned unchanged in both columns.
build_statsCI_data <- function(plt_models_df) {
    mltd_models_df <- melt(plt_models_df, id.vars="model_id")
    variable_names <- as.character(mltd_models_df[["variable"]])
    # Vectorised and anchored on the LAST dot: splitting on the regex
    # ".<data>" anywhere in the name truncated labels that contain the suffix
    # text (e.g. "max.profit.fit" -> "max.pr"), and iterating over 1:nrow()
    # misbehaved on an empty melt result.
    mltd_models_df$data <- sub("^.*[.]", "", variable_names)
    mltd_models_df$label <- sub("[.][^.]*$", "", variable_names)
    #print(mltd_models_df)
    return(mltd_models_df)
}
# Long-format point estimates and long-format CI bounds
mltd_models_df <- build_statsCI_data(plt_models_df)
mltdCI_models_df <- melt(pltCI_models_df, id.vars="model_id")
# Tag every CI row with its metric label, data-set suffix and bound type.
# "Upper" is tried before "Lower"; the first one that splits the column name
# into more than one piece wins.
for (row_ix in seq_len(nrow(mltdCI_models_df))) {
    for (type in c("Upper", "Lower")) {
        var_components <- unlist(strsplit(
            as.character(mltdCI_models_df[row_ix, "variable"]), type))
        if (length(var_components) <= 1) {
            next
        }
        #print(sprintf("row_ix:%d; type:%s; ", row_ix, type))
        mltdCI_models_df[row_ix, "label"] <- var_components[1]
        # var_components[2] looks like ".<data>"; its 2nd "."-piece is <data>
        mltdCI_models_df[row_ix, "data"] <-
            unlist(strsplit(var_components[2], "[.]"))[2]
        mltdCI_models_df[row_ix, "type"] <- type
        break
    }
}
#print(mltdCI_models_df)
# castCI_models_df <- dcast(mltdCI_models_df, value ~ type, fun.aggregate=sum)
# print(castCI_models_df)
# One row per id combination, with the bounds spread into value.<type> columns
wideCI_models_df <- reshape(subset(mltdCI_models_df, select=-variable),
    idvar=setdiff(names(mltdCI_models_df), c("type", "value", "variable")),
    timevar="type",
    direction="wide")
#print(wideCI_models_df)
# Attach the point estimates to the CI rows (all CI rows are kept)
mrgdCI_models_df <- merge(wideCI_models_df, mltd_models_df, all.x=TRUE)
#print(mrgdCI_models_df)
# Merge stats back in if CIs don't exist.
# For every (label, data) combination of the labels and data-set suffixes
# seen in mltd_models_df, a "<label>.<data>" metric that is not yet present
# is pulled from glb_models_df and appended in long format.
# The set of present names is computed once (it does not change while
# scanning), and candidates are generated label by label so the resulting
# order is label-major, data-minor.
present_var_types <- unique(paste(mltd_models_df$label, mltd_models_df$data,
                                  sep="."))
data_types <- unique(mltd_models_df$data)
goback_vars <- unlist(lapply(unique(mltd_models_df$label), function(label)
    setdiff(paste0(label, ".", data_types), present_var_types)))
# Keep only metrics that glb_models_df actually has; selecting a missing
# column would stop with "undefined columns selected".
goback_vars <- intersect(goback_vars, names(glb_models_df))
if (length(goback_vars) > 0) {
    mltd_goback_df <- build_statsCI_data(glb_models_df[, c("model_id", goback_vars)])
    mltd_models_df <- rbind(mltd_models_df, mltd_goback_df)
}
# Add each model's method (used as the bar colour in the plot that follows)
mltd_models_df <- merge(mltd_models_df, glb_models_df[, c("model_id", "model_method")],
                        all.x=TRUE)
# Open a png device writing to "<glb_out_pfx>models_bar.png"
# (width = 480*3, height = 480*2).
png(paste0(glb_out_pfx, "models_bar.png"), width=480*3, height=480*2)
# Build the plot, keep it in `gp`, and print it to the png device:
#   - bars of `value` per model_id, coloured by model_method (myplot_bar is
#     not defined in this chunk; presumably a project helper returning a
#     ggplot object -- TODO confirm)
#   - error bars from value.Lower to value.Upper taken from mrgdCI_models_df
#   - one facet per label (rows) x data (columns), with free scales
#   - x-axis labels rotated 90 degrees
print(gp <- myplot_bar(mltd_models_df, "model_id", "value", colorcol_name="model_method") +
geom_errorbar(data=mrgdCI_models_df,
mapping=aes(x=model_id, ymax=value.Upper, ymin=value.Lower), width=0.5) +
facet_grid(label ~ data, scales="free") +
theme(axis.text.x = element_text(angle = 90,vjust = 0.5)))
## Warning in loop_apply(n, do.ply): Stacking not well defined when ymin != 0
# Close the png device opened above.
dev.off()
## quartz_off_screen
## 2
# Print the stored plot a second time, now on whichever device is current
# after the png device was closed.
print(gp)
## Warning in loop_apply(n, do.ply): Stacking not well defined when ymin != 0
# used for console inspection
# Build the one-sided formula used to rank the models and print the ranked
# table of selection metrics.
# Each metric in glb_model_evl_criteria is preceded by a sign: "-" when its
# name contains "max", "+" otherwise. For binomial classification
# "- opt.prob.threshold.OOB" is appended as a final term.
metric_signs <- ifelse(grepl("max", glb_model_evl_criteria), "-", "+")
# rbind() + c() interleaves: sign1, metric1, sign2, metric2, ...
model_evl_terms <- c(rbind(metric_signs, glb_model_evl_criteria))
if (glb_is_classification && glb_is_binomial)
    model_evl_terms <- c(model_evl_terms, "-", "opt.prob.threshold.OOB")
model_sel_frmla <- as.formula(paste(c("~ ", model_evl_terms), collapse=" "))
# `if` without `else` yields NULL, which c() drops. The previous
# ifelse(cond, "opt.prob.threshold.OOB", NULL) stopped with
# "replacement has length zero" whenever the condition was FALSE.
print(dsp_models_df <- orderBy(model_sel_frmla, glb_models_df)
      [, c("model_id", glb_model_evl_criteria,
           if (glb_is_classification && glb_is_binomial)
               "opt.prob.threshold.OOB")])
## model_id max.Accuracy.OOB max.auc.OOB max.Kappa.OOB
## 8 Low.cor.X.glm 0.9095771 0.9229354 0.6890845
## 9 All.X.glm 0.8988819 0.9159268 0.6561351
## 10 All.X.no.rnorm.rpart 0.8862421 0.7084504 0.5054039
## 1 MFO.myMFO_classfr 0.8327662 0.5000000 0.0000000
## 3 Max.cor.Y.cv.0.rpart 0.8327662 0.5000000 0.0000000
## 4 Max.cor.Y.cv.0.cp.0.rpart 0.8327662 0.5000000 0.0000000
## 5 Max.cor.Y.rpart 0.8327662 0.5000000 0.0000000
## 7 Interact.High.cor.Y.glm 0.8021390 0.7911694 0.3451612
## 6 Max.cor.Y.glm 0.7316480 0.7102060 0.2283681
## 2 Random.myrandom_classfr 0.1672338 0.4877001 0.0000000
## min.aic.fit opt.prob.threshold.OOB
## 8 2088.981 0.3
## 9 2140.884 0.3
## 10 NA 0.7
## 1 NA 0.5
## 3 NA 0.5
## 4 NA 0.5
## 5 NA 0.5
## 7 3300.299 0.3
## 6 3714.601 0.2
## 2 NA 0.1
# Print whatever myplot_radar() returns for the ranked selection-metric table
# dsp_models_df.
# NOTE(review): myplot_radar is not defined in this chunk; presumably a
# project plotting helper that builds a radar chart -- confirm.
print(myplot_radar(radar_inp_df=dsp_models_df))
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have
## 10. Consider specifying shapes manually. if you must have them.
## Warning in loop_apply(n, do.ply): Removed 27 rows containing missing values
## (geom_point).
## Warning in loop_apply(n, do.ply): Removed 6 rows containing missing values
## (geom_text).
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have
## 10. Consider specifying shapes manually. if you must have them.
# Show the formula that was used to order the models.
print("Metrics used for model selection:"); print(model_sel_frmla)
## [1] "Metrics used for model selection:"
## ~-max.Accuracy.OOB - max.auc.OOB - max.Kappa.OOB + min.aic.fit -
## opt.prob.threshold.OOB
# Show the model_id in the first row of the ordered table dsp_models_df.
print(sprintf("Best model id: %s", dsp_models_df[1, "model_id"]))
## [1] "Best model id: Low.cor.X.glm"
# Pick the model to carry forward: a user-supplied glb_sel_mdl_id wins;
# otherwise take the first row of dsp_models_df, falling back to the second
# row when the first is the interaction-terms glm.
if (!is.null(glb_sel_mdl_id)) {
    print(sprintf("User specified selection: %s", glb_sel_mdl_id))
} else {
    glb_sel_mdl_id <- dsp_models_df[1, "model_id"]
    if (glb_sel_mdl_id == "Interact.High.cor.Y.glm") {
        warning("glb_sel_mdl_id: Interact.High.cor.Y.glm; myextract_mdl_feats does not currently support interaction terms")
        glb_sel_mdl_id <- dsp_models_df[2, "model_id"]
    }
}
# Store the selected model object in glb_sel_mdl and print it
myprint_mdl(glb_sel_mdl <- glb_models_lst[[glb_sel_mdl_id]])
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1668 -0.2703 -0.0764 0.0000 3.9538
##
## Coefficients: (11 not defined because of singularities)
## Estimate
## (Intercept) -5.030e+00
## WordCount.log 1.328e+00
## S.ratio.sum.TfIdf.nwrds 1.352e+00
## H.ratio.sum.TfIdf.nwrds 2.089e-01
## .clusterid.fctr101 -1.514e+00
## .clusterid.fctr102 -1.983e+00
## .clusterid.fctr103 -2.060e+00
## .clusterid.fctr104 -1.689e+00
## .clusterid.fctr401 -6.466e+00
## .clusterid.fctr402 -2.233e+01
## .clusterid.fctr403 -4.802e+00
## .clusterid.fctr404 -4.324e+00
## .clusterid.fctr405 -3.755e+00
## .clusterid.fctr406 -4.902e+00
## .clusterid.fctr407 -2.207e+01
## .clusterid.fctr408 -6.368e+00
## .clusterid.fctr409 -6.186e+00
## .clusterid.fctr410 -2.238e+01
## .clusterid.fctr411 -2.151e+01
## .clusterid.fctr412 -2.222e+01
## .clusterid.fctr413 -2.284e+01
## .clusterid.fctr414 -2.690e+00
## .clusterid.fctr415 -2.239e+01
## .clusterid.fctr501 -3.957e+00
## .clusterid.fctr502 -5.542e+00
## .clusterid.fctr503 -5.543e+00
## .clusterid.fctr504 -5.790e+00
## .clusterid.fctr505 -4.697e+00
## .clusterid.fctr506 -5.054e+00
## .clusterid.fctr507 -5.145e+00
## .clusterid.fctr508 -5.611e+00
## .clusterid.fctr509 -4.861e+00
## .clusterid.fctr510 -4.345e+00
## .clusterid.fctr511 -5.468e+00
## .clusterid.fctr512 -4.409e+00
## .clusterid.fctr513 -4.506e+00
## .clusterid.fctr701 -3.658e+00
## .clusterid.fctr702 -3.950e+00
## .clusterid.fctr703 -2.096e+01
## .clusterid.fctr704 -4.420e+00
## .clusterid.fctr705 -4.503e+00
## .clusterid.fctr706 -2.953e+00
## .clusterid.fctr707 -5.127e+00
## .clusterid.fctr1101 -1.007e+00
## .clusterid.fctr1102 2.173e-01
## .clusterid.fctr1103 1.037e-01
## .clusterid.fctr1104 -1.254e+00
## .clusterid.fctr1105 -1.743e+00
## .clusterid.fctr1106 -1.235e+00
## .clusterid.fctr1107 -3.117e+00
## .clusterid.fctr1108 -1.335e+00
## .clusterid.fctr1109 1.868e+01
## .clusterid.fctr1501 -4.402e+00
## .clusterid.fctr1502 -4.866e+00
## .clusterid.fctr1503 -3.608e+00
## .clusterid.fctr1504 -5.552e+00
## .clusterid.fctr1505 -4.782e+00
## .clusterid.fctr1506 -3.150e+00
## .clusterid.fctr1507 -4.600e+00
## .clusterid.fctr1508 -4.558e+00
## .clusterid.fctr1509 -4.017e+00
## .clusterid.fctr1510 -5.980e+00
## .clusterid.fctr1511 -2.309e+01
## .clusterid.fctr1512 -4.016e+00
## .clusterid.fctr1513 -4.971e+00
## .clusterid.fctr1514 -1.867e+01
## .clusterid.fctr1515 -6.533e+00
## .clusterid.fctr1516 -3.658e+00
## .clusterid.fctr1517 -3.088e+00
## .clusterid.fctr1518 -6.116e+00
## .clusterid.fctr1519 -5.024e+00
## .clusterid.fctr1520 -1.615e+01
## .clusterid.fctr1521 -4.031e+00
## .clusterid.fctr1522 -4.525e+00
## .clusterid.fctr1523 -2.194e+01
## .clusterid.fctr1524 -3.368e+00
## .clusterid.fctr1801 -2.967e+00
## .clusterid.fctr1802 -2.315e+00
## .clusterid.fctr1803 -1.908e+00
## .clusterid.fctr1804 -1.858e+00
## H.sum.TfIdf -1.452e-01
## A.sum.TfIdf -1.593e-01
## `PubDate.hour.fctr(7.67,15.3]` 1.913e-01
## `PubDate.hour.fctr(15.3,23]` 3.653e-01
## H.npnct19.log 1.693e+00
## S.ratio.nstopwrds.nwrds -6.188e+00
## PubDate.wkend -1.808e-01
## H.P.recap.colon 1.840e+00
## H.P.quandary 2.194e+01
## H.P.no.comment.colon 2.040e+00
## A.npnct19.log 1.366e+00
## H.P.facts.figures 9.467e-01
## H.npnct08.log 1.208e+00
## PubDate.last10.log 1.838e-01
## PubDate.last1.log -2.865e-02
## H.P.readers.respond 6.912e+00
## S.T.make -1.305e+00
## H.ratio.nstopwrds.nwrds 5.443e+00
## H.T.get 4.871e-01
## S.npnct01.log 2.462e+00
## H.npnct16.log 5.659e-01
## S.T.can -1.619e+00
## H.T.ebola -1.239e-01
## H.npnct01.log -1.264e+00
## S.T.said 7.922e-01
## H.T.make -3.328e-01
## H.npnct11.log 4.187e-01
## `myCategory.fctrForeign#World#Asia Pacific` -5.996e+00
## `myCategory.fctr#Multimedia#` -6.083e+00
## `myCategory.fctrCulture#Arts#` NA
## `myCategory.fctrBusiness#Business Day#Dealbook` NA
## myCategory.fctrmyOther -2.268e+01
## `myCategory.fctrBusiness#Technology#` NA
## `myCategory.fctrBusiness#Crosswords/Games#` -6.647e-01
## `myCategory.fctrTStyle##` -6.193e+00
## `myCategory.fctrForeign#World#` -2.160e+01
## `myCategory.fctrOpEd#Opinion#` NA
## `myCategory.fctrStyles##Fashion` -2.275e+01
## `myCategory.fctr#Opinion#Room For Debate` -8.740e+00
## `myCategory.fctr#U.S.#Education` -2.379e+01
## `myCategory.fctr##` NA
## `myCategory.fctrMetro#N.Y. / Region#` -3.691e+00
## `myCategory.fctrBusiness#Business Day#Small Business` -6.186e+00
## `myCategory.fctrStyles#U.S.#` NA
## `myCategory.fctrTravel#Travel#` -5.764e+00
## `myCategory.fctr#Opinion#The Public Editor` NA
## S.T.one -1.001e+00
## H.P.s.notebook -1.685e+01
## H.T.take -4.009e-01
## A.npnct16.log -6.838e-01
## S.npnct16.log NA
## A.T.presid 4.792e+02
## S.T.presid -4.788e+02
## S.npnct08.log 1.130e+00
## A.npnct08.log NA
## PubDate.last100.log 1.915e-02
## .rnorm -7.205e-02
## H.npnct05.log -2.508e+01
## H.P.friday.night.music -2.342e+00
## H.T.say -4.675e-01
## H.T.obama -1.512e-01
## H.T.bank -6.400e-02
## `PubDate.date.fctr(7,13]` -2.675e-02
## `PubDate.date.fctr(13,19]` -1.162e-01
## `PubDate.date.fctr(19,25]` -1.139e-01
## `PubDate.date.fctr(25,31]` 8.689e-02
## `PubDate.second.fctr(14.8,29.5]` 8.971e-02
## `PubDate.second.fctr(29.5,44.2]` -1.987e-02
## `PubDate.second.fctr(44.2,59.1]` -2.213e-01
## H.npnct07.log 3.248e-01
## S.npnct07.log -2.521e+01
## S.npnct03.log -2.993e+01
## A.npnct18.log -2.802e+01
## H.npnct12.log 4.017e-01
## H.T.word 2.508e+00
## H.T.big -3.376e-01
## S.P.year.colon -1.058e+01
## S.T.obama -8.676e-01
## S.npnct20.log -2.684e+01
## H.npnct02.log -1.859e+01
## H.T.test -1.587e-01
## S.npnct14.log 8.653e-01
## H.P.on.this.day -1.454e+01
## S.P.first.draft -1.548e+01
## S.T.take -1.358e+00
## S.npnct06.log 7.141e-01
## S.T.time -1.124e+00
## H.T.newyorktim -1.221e-01
## H.npnct13.log -2.796e-01
## H.T.deal -2.353e+01
## S.T.new 1.629e-02
## H.T.billion -8.357e-02
## S.P.metropolitan.diary.colon -8.723e+00
## H.T.polit -5.421e-01
## H.P.verbatim.colon -1.454e+01
## H.T.china -9.870e-01
## H.T.art -1.212e+00
## `PubDate.minute.fctr(14.8,29.5]` -2.019e-01
## `PubDate.minute.fctr(29.5,44.2]` -2.394e-01
## `PubDate.minute.fctr(44.2,59.1]` 6.186e-02
## H.T.read -1.092e+00
## S.npnct12.log -1.700e-01
## A.T.year -2.826e-01
## A.T.will -1.028e+00
## S.T.appear -5.878e-01
## PubDate.wkday.fctr1 -2.992e-01
## PubDate.wkday.fctr2 -8.611e-01
## PubDate.wkday.fctr3 -4.455e-01
## PubDate.wkday.fctr4 -6.742e-01
## PubDate.wkday.fctr5 -5.502e-01
## PubDate.wkday.fctr6 -1.061e+00
## H.T.pictur 1.411e-01
## H.T.new -5.904e-01
## A.T.senat 8.068e-01
## S.T.show -1.353e+00
## H.P.today.in.smallbusiness -1.560e+01
## S.T.day -9.731e-01
## H.P.first.draft -1.518e+01
## S.npnct28.log -1.494e+01
## H.P.daily.clip.report -1.570e+01
## H.T.clip NA
## S.P.daily.clip.report NA
## A.T.first 1.077e+00
## H.T.news -6.977e-01
## H.T.X2014 -6.582e-01
## A.T.newyork 2.368e+00
## A.T.report -1.648e+00
## A.T.compani -6.329e-01
## A.T.word -7.426e-01
## H.T.busi -5.133e-01
## A.T.newyorktim 2.318e+00
## A.npnct13.log 1.028e+00
## S.T.share -1.787e+00
## A.T.articl -3.262e+00
## H.T.newyork -8.485e-01
## H.T.springsumm -1.186e+01
## H.T.day -4.700e-01
## S.T.diari 2.075e+01
## H.T.report -1.296e+00
## S.npnct04.log -1.227e+00
## S.T.herald 5.240e+01
## S.npnct15.log 8.196e-02
## H.T.week -8.774e-01
## A.T.photo -2.082e+00
## A.T.intern -3.241e+00
## S.T.tribun -4.993e+01
## S.P.fashion.week 2.553e+00
## S.T.archiv -4.537e+01
## H.P.fashion.week -1.409e+01
## H.npnct15.log -1.630e+00
## A.T.fashion -5.737e+01
## A.T.week -1.993e-01
## H.nstopwrds.log -1.233e+00
## H.npnct28.log -1.532e+00
## S.npnct11.log -1.121e-01
## S.nstopwrds.log 2.222e+00
## H.ndgts.log 6.602e-01
## S.ndgts.log 2.324e-01
## H.nuppr.log 1.569e+00
## H.nwrds.log -3.646e-01
## S.nwrds.log -3.602e-01
## A.nchrs.log -2.296e-01
## A.nwrds.unq.log -1.267e+00
## S.nuppr.log -5.401e-01
## Std. Error z value
## (Intercept) 5.314e+00 -0.947
## WordCount.log 1.139e-01 11.662
## S.ratio.sum.TfIdf.nwrds 6.247e-01 2.164
## H.ratio.sum.TfIdf.nwrds 2.001e-01 1.044
## .clusterid.fctr101 1.555e+00 -0.973
## .clusterid.fctr102 1.584e+00 -1.252
## .clusterid.fctr103 1.684e+00 -1.223
## .clusterid.fctr104 1.714e+00 -0.985
## .clusterid.fctr401 2.015e+00 -3.209
## .clusterid.fctr402 2.623e+03 -0.009
## .clusterid.fctr403 1.934e+00 -2.483
## .clusterid.fctr404 1.710e+00 -2.528
## .clusterid.fctr405 1.632e+00 -2.301
## .clusterid.fctr406 1.863e+00 -2.631
## .clusterid.fctr407 2.563e+03 -0.009
## .clusterid.fctr408 2.030e+00 -3.137
## .clusterid.fctr409 2.038e+00 -3.035
## .clusterid.fctr410 3.419e+03 -0.007
## .clusterid.fctr411 4.105e+03 -0.005
## .clusterid.fctr412 3.749e+03 -0.006
## .clusterid.fctr413 4.528e+03 -0.005
## .clusterid.fctr414 1.803e+00 -1.492
## .clusterid.fctr415 5.275e+03 -0.004
## .clusterid.fctr501 1.555e+00 -2.545
## .clusterid.fctr502 1.629e+00 -3.403
## .clusterid.fctr503 1.665e+00 -3.329
## .clusterid.fctr504 1.692e+00 -3.421
## .clusterid.fctr505 1.903e+00 -2.468
## .clusterid.fctr506 1.611e+00 -3.136
## .clusterid.fctr507 1.665e+00 -3.091
## .clusterid.fctr508 1.882e+00 -2.981
## .clusterid.fctr509 1.714e+00 -2.836
## .clusterid.fctr510 1.691e+00 -2.569
## .clusterid.fctr511 1.880e+00 -2.908
## .clusterid.fctr512 1.700e+00 -2.594
## .clusterid.fctr513 1.723e+00 -2.615
## .clusterid.fctr701 1.587e+00 -2.304
## .clusterid.fctr702 1.614e+00 -2.448
## .clusterid.fctr703 2.585e+03 -0.008
## .clusterid.fctr704 1.626e+00 -2.718
## .clusterid.fctr705 1.644e+00 -2.738
## .clusterid.fctr706 1.587e+00 -1.861
## .clusterid.fctr707 1.913e+00 -2.680
## .clusterid.fctr1101 1.548e+00 -0.650
## .clusterid.fctr1102 1.615e+00 0.135
## .clusterid.fctr1103 1.732e+00 0.060
## .clusterid.fctr1104 1.598e+00 -0.785
## .clusterid.fctr1105 1.603e+00 -1.087
## .clusterid.fctr1106 1.679e+00 -0.736
## .clusterid.fctr1107 1.629e+00 -1.914
## .clusterid.fctr1108 1.641e+00 -0.813
## .clusterid.fctr1109 4.922e+03 0.004
## .clusterid.fctr1501 1.612e+00 -2.732
## .clusterid.fctr1502 1.627e+00 -2.990
## .clusterid.fctr1503 1.579e+00 -2.285
## .clusterid.fctr1504 1.668e+00 -3.328
## .clusterid.fctr1505 1.630e+00 -2.933
## .clusterid.fctr1506 1.817e+00 -1.733
## .clusterid.fctr1507 1.766e+00 -2.605
## .clusterid.fctr1508 1.787e+00 -2.551
## .clusterid.fctr1509 1.617e+00 -2.485
## .clusterid.fctr1510 1.956e+00 -3.057
## .clusterid.fctr1511 3.096e+03 -0.007
## .clusterid.fctr1512 1.714e+00 -2.343
## .clusterid.fctr1513 1.875e+00 -2.650
## .clusterid.fctr1514 2.349e+03 -0.008
## .clusterid.fctr1515 1.948e+00 -3.353
## .clusterid.fctr1516 1.631e+00 -2.243
## .clusterid.fctr1517 1.692e+00 -1.826
## .clusterid.fctr1518 1.909e+00 -3.203
## .clusterid.fctr1519 1.682e+00 -2.987
## .clusterid.fctr1520 3.822e+03 -0.004
## .clusterid.fctr1521 1.686e+00 -2.392
## .clusterid.fctr1522 1.712e+00 -2.643
## .clusterid.fctr1523 4.277e+03 -0.005
## .clusterid.fctr1524 2.375e+00 -1.418
## .clusterid.fctr1801 1.558e+00 -1.904
## .clusterid.fctr1802 1.593e+00 -1.453
## .clusterid.fctr1803 1.668e+00 -1.144
## .clusterid.fctr1804 2.153e+00 -0.863
## H.sum.TfIdf 9.693e-02 -1.498
## A.sum.TfIdf 1.228e-01 -1.297
## `PubDate.hour.fctr(7.67,15.3]` 2.668e-01 0.717
## `PubDate.hour.fctr(15.3,23]` 2.734e-01 1.336
## H.npnct19.log 3.624e-01 4.671
## S.ratio.nstopwrds.nwrds 5.160e+00 -1.199
## PubDate.wkend 4.642e-01 -0.389
## H.P.recap.colon 1.237e+00 1.487
## H.P.quandary 6.383e+03 0.003
## H.P.no.comment.colon 1.126e+00 1.812
## A.npnct19.log 4.109e-01 3.325
## H.P.facts.figures 1.518e+00 0.624
## H.npnct08.log 5.060e-01 2.387
## PubDate.last10.log 1.318e-01 1.394
## PubDate.last1.log 4.869e-02 -0.588
## H.P.readers.respond 1.151e+00 6.003
## S.T.make 6.489e-01 -2.012
## H.ratio.nstopwrds.nwrds 2.872e+00 1.895
## H.T.get 4.354e-01 1.119
## S.npnct01.log 2.204e+00 1.117
## H.npnct16.log 7.020e-01 0.806
## S.T.can 8.908e-01 -1.818
## H.T.ebola 3.304e-01 -0.375
## H.npnct01.log 1.389e+00 -0.910
## S.T.said 9.205e-01 0.861
## H.T.make 3.710e-01 -0.897
## H.npnct11.log 2.266e-01 1.848
## `myCategory.fctrForeign#World#Asia Pacific` 1.661e+00 -3.610
## `myCategory.fctr#Multimedia#` 1.717e+00 -3.542
## `myCategory.fctrCulture#Arts#` NA NA
## `myCategory.fctrBusiness#Business Day#Dealbook` NA NA
## myCategory.fctrmyOther 2.984e+03 -0.008
## `myCategory.fctrBusiness#Technology#` NA NA
## `myCategory.fctrBusiness#Crosswords/Games#` 1.597e+00 -0.416
## `myCategory.fctrTStyle##` 1.589e+00 -3.898
## `myCategory.fctrForeign#World#` 1.879e+03 -0.011
## `myCategory.fctrOpEd#Opinion#` NA NA
## `myCategory.fctrStyles##Fashion` 1.546e+03 -0.015
## `myCategory.fctr#Opinion#Room For Debate` 1.750e+00 -4.994
## `myCategory.fctr#U.S.#Education` 9.385e+02 -0.025
## `myCategory.fctr##` NA NA
## `myCategory.fctrMetro#N.Y. / Region#` 1.602e+00 -2.303
## `myCategory.fctrBusiness#Business Day#Small Business` 1.665e+00 -3.714
## `myCategory.fctrStyles#U.S.#` NA NA
## `myCategory.fctrTravel#Travel#` 1.845e+00 -3.124
## `myCategory.fctr#Opinion#The Public Editor` NA NA
## S.T.one 6.924e-01 -1.446
## H.P.s.notebook 8.449e+03 -0.002
## H.T.take 4.984e-01 -0.804
## A.npnct16.log 1.385e+00 -0.494
## S.npnct16.log NA NA
## A.T.presid 1.548e+05 0.003
## S.T.presid 1.548e+05 -0.003
## S.npnct08.log 7.335e-01 1.540
## A.npnct08.log NA NA
## PubDate.last100.log 4.824e-02 0.397
## .rnorm 6.898e-02 -1.044
## H.npnct05.log 9.901e+03 -0.003
## H.P.friday.night.music 1.348e+00 -1.737
## H.T.say 4.615e-01 -1.013
## H.T.obama 4.766e-01 -0.317
## H.T.bank 5.200e-01 -0.123
## `PubDate.date.fctr(7,13]` 2.129e-01 -0.126
## `PubDate.date.fctr(13,19]` 2.123e-01 -0.547
## `PubDate.date.fctr(19,25]` 2.074e-01 -0.549
## `PubDate.date.fctr(25,31]` 2.254e-01 0.386
## `PubDate.second.fctr(14.8,29.5]` 1.903e-01 0.471
## `PubDate.second.fctr(29.5,44.2]` 1.861e-01 -0.107
## `PubDate.second.fctr(44.2,59.1]` 1.935e-01 -1.144
## H.npnct07.log 2.276e-01 1.427
## S.npnct07.log 1.106e+04 -0.002
## S.npnct03.log 8.796e+03 -0.003
## A.npnct18.log 8.725e+03 -0.003
## H.npnct12.log 3.478e-01 1.155
## H.T.word 1.021e+00 2.457
## H.T.big 6.423e-01 -0.526
## S.P.year.colon 3.884e+03 -0.003
## S.T.obama 1.498e+00 -0.579
## S.npnct20.log 7.852e+03 -0.003
## H.npnct02.log 4.924e+03 -0.004
## H.T.test 7.261e-01 -0.218
## S.npnct14.log 1.661e+00 0.521
## H.P.on.this.day 5.627e+03 -0.003
## S.P.first.draft 4.076e+03 -0.004
## S.T.take 1.180e+00 -1.151
## S.npnct06.log 1.564e+00 0.457
## S.T.time 9.718e-01 -1.157
## H.T.newyorktim 8.069e-01 -0.151
## H.npnct13.log 2.435e-01 -1.148
## H.T.deal 2.601e+03 -0.009
## S.T.new 8.046e-01 0.020
## H.T.billion 8.772e-01 -0.095
## S.P.metropolitan.diary.colon 3.957e+00 -2.205
## H.T.polit 4.494e-01 -1.206
## H.P.verbatim.colon 3.727e+03 -0.004
## H.T.china 1.083e+00 -0.911
## H.T.art 1.054e+00 -1.150
## `PubDate.minute.fctr(14.8,29.5]` 1.990e-01 -1.015
## `PubDate.minute.fctr(29.5,44.2]` 1.941e-01 -1.233
## `PubDate.minute.fctr(44.2,59.1]` 1.993e-01 0.310
## H.T.read 4.636e-01 -2.356
## S.npnct12.log 2.309e-01 -0.736
## A.T.year 9.978e-01 -0.283
## A.T.will 9.269e-01 -1.109
## S.T.appear 1.307e+00 -0.450
## PubDate.wkday.fctr1 5.611e-01 -0.533
## PubDate.wkday.fctr2 6.108e-01 -1.410
## PubDate.wkday.fctr3 6.047e-01 -0.737
## PubDate.wkday.fctr4 5.951e-01 -1.133
## PubDate.wkday.fctr5 6.051e-01 -0.909
## PubDate.wkday.fctr6 5.472e-01 -1.939
## H.T.pictur 6.933e-01 0.203
## H.T.new 5.492e-01 -1.075
## A.T.senat 9.282e-01 0.869
## S.T.show 1.240e+00 -1.091
## H.P.today.in.smallbusiness 2.939e+03 -0.005
## S.T.day 1.147e+00 -0.849
## H.P.first.draft 2.072e+03 -0.007
## S.npnct28.log 2.075e+03 -0.007
## H.P.daily.clip.report 2.626e+03 -0.006
## H.T.clip NA NA
## S.P.daily.clip.report NA NA
## A.T.first 1.084e+00 0.994
## H.T.news 8.649e-01 -0.807
## H.T.X2014 1.102e+00 -0.597
## A.T.newyork 1.111e+00 2.131
## A.T.report 1.221e+00 -1.350
## A.T.compani 9.492e-01 -0.667
## A.T.word 1.189e+00 -0.625
## H.T.busi 7.450e-01 -0.689
## A.T.newyorktim 1.349e+00 1.718
## A.npnct13.log 3.069e-01 3.350
## S.T.share 1.157e+00 -1.544
## A.T.articl 2.474e+00 -1.318
## H.T.newyork 5.433e-01 -1.562
## H.T.springsumm 1.072e+03 -0.011
## H.T.day 7.523e-01 -0.625
## S.T.diari 8.374e+00 2.479
## H.T.report 9.315e-01 -1.391
## S.npnct04.log 7.741e-01 -1.585
## S.T.herald 4.354e+03 0.012
## S.npnct15.log 5.822e-01 0.141
## H.T.week 7.729e-01 -1.135
## A.T.photo 2.217e+00 -0.939
## A.T.intern 2.799e+00 -1.158
## S.T.tribun 4.729e+03 -0.011
## S.P.fashion.week 1.217e+03 0.002
## S.T.archiv 3.506e+03 -0.013
## H.P.fashion.week 9.439e+02 -0.015
## H.npnct15.log 4.030e-01 -4.045
## A.T.fashion 2.894e+03 -0.020
## A.T.week 9.754e-01 -0.204
## H.nstopwrds.log 6.491e-01 -1.900
## H.npnct28.log 1.868e+00 -0.820
## S.npnct11.log 1.756e-01 -0.638
## S.nstopwrds.log 1.731e+00 1.284
## H.ndgts.log 2.899e-01 2.277
## S.ndgts.log 2.272e-01 1.023
## H.nuppr.log 7.397e-01 2.121
## H.nwrds.log 1.007e+00 -0.362
## S.nwrds.log 2.021e+00 -0.178
## A.nchrs.log 8.840e-01 -0.260
## A.nwrds.unq.log 9.111e-01 -1.390
## S.nuppr.log 1.850e-01 -2.919
## Pr(>|z|)
## (Intercept) 0.343833
## WordCount.log < 2e-16 ***
## S.ratio.sum.TfIdf.nwrds 0.030476 *
## H.ratio.sum.TfIdf.nwrds 0.296357
## .clusterid.fctr101 0.330396
## .clusterid.fctr102 0.210554
## .clusterid.fctr103 0.221327
## .clusterid.fctr104 0.324628
## .clusterid.fctr401 0.001334 **
## .clusterid.fctr402 0.993206
## .clusterid.fctr403 0.013039 *
## .clusterid.fctr404 0.011460 *
## .clusterid.fctr405 0.021368 *
## .clusterid.fctr406 0.008512 **
## .clusterid.fctr407 0.993131
## .clusterid.fctr408 0.001707 **
## .clusterid.fctr409 0.002402 **
## .clusterid.fctr410 0.994777
## .clusterid.fctr411 0.995819
## .clusterid.fctr412 0.995271
## .clusterid.fctr413 0.995975
## .clusterid.fctr414 0.135718
## .clusterid.fctr415 0.996614
## .clusterid.fctr501 0.010933 *
## .clusterid.fctr502 0.000667 ***
## .clusterid.fctr503 0.000870 ***
## .clusterid.fctr504 0.000624 ***
## .clusterid.fctr505 0.013577 *
## .clusterid.fctr506 0.001712 **
## .clusterid.fctr507 0.001996 **
## .clusterid.fctr508 0.002875 **
## .clusterid.fctr509 0.004573 **
## .clusterid.fctr510 0.010205 *
## .clusterid.fctr511 0.003635 **
## .clusterid.fctr512 0.009488 **
## .clusterid.fctr513 0.008923 **
## .clusterid.fctr701 0.021205 *
## .clusterid.fctr702 0.014385 *
## .clusterid.fctr703 0.993531
## .clusterid.fctr704 0.006561 **
## .clusterid.fctr705 0.006179 **
## .clusterid.fctr706 0.062766 .
## .clusterid.fctr707 0.007358 **
## .clusterid.fctr1101 0.515407
## .clusterid.fctr1102 0.892969
## .clusterid.fctr1103 0.952286
## .clusterid.fctr1104 0.432665
## .clusterid.fctr1105 0.276867
## .clusterid.fctr1106 0.461774
## .clusterid.fctr1107 0.055676 .
## .clusterid.fctr1108 0.415977
## .clusterid.fctr1109 0.996972
## .clusterid.fctr1501 0.006302 **
## .clusterid.fctr1502 0.002787 **
## .clusterid.fctr1503 0.022308 *
## .clusterid.fctr1504 0.000873 ***
## .clusterid.fctr1505 0.003354 **
## .clusterid.fctr1506 0.083040 .
## .clusterid.fctr1507 0.009197 **
## .clusterid.fctr1508 0.010744 *
## .clusterid.fctr1509 0.012963 *
## .clusterid.fctr1510 0.002235 **
## .clusterid.fctr1511 0.994050
## .clusterid.fctr1512 0.019127 *
## .clusterid.fctr1513 0.008039 **
## .clusterid.fctr1514 0.993658
## .clusterid.fctr1515 0.000799 ***
## .clusterid.fctr1516 0.024908 *
## .clusterid.fctr1517 0.067919 .
## .clusterid.fctr1518 0.001358 **
## .clusterid.fctr1519 0.002817 **
## .clusterid.fctr1520 0.996628
## .clusterid.fctr1521 0.016777 *
## .clusterid.fctr1522 0.008227 **
## .clusterid.fctr1523 0.995908
## .clusterid.fctr1524 0.156210
## .clusterid.fctr1801 0.056859 .
## .clusterid.fctr1802 0.146205
## .clusterid.fctr1803 0.252556
## .clusterid.fctr1804 0.388296
## H.sum.TfIdf 0.134221
## A.sum.TfIdf 0.194501
## `PubDate.hour.fctr(7.67,15.3]` 0.473219
## `PubDate.hour.fctr(15.3,23]` 0.181527
## H.npnct19.log 2.99e-06 ***
## S.ratio.nstopwrds.nwrds 0.230446
## PubDate.wkend 0.696943
## H.P.recap.colon 0.136953
## H.P.quandary 0.997258
## H.P.no.comment.colon 0.070046 .
## A.npnct19.log 0.000883 ***
## H.P.facts.figures 0.532807
## H.npnct08.log 0.016968 *
## PubDate.last10.log 0.163285
## PubDate.last1.log 0.556274
## H.P.readers.respond 1.94e-09 ***
## S.T.make 0.044263 *
## H.ratio.nstopwrds.nwrds 0.058098 .
## H.T.get 0.263303
## S.npnct01.log 0.263893
## H.npnct16.log 0.420184
## S.T.can 0.069116 .
## H.T.ebola 0.707742
## H.npnct01.log 0.362822
## S.T.said 0.389408
## H.T.make 0.369807
## H.npnct11.log 0.064651 .
## `myCategory.fctrForeign#World#Asia Pacific` 0.000307 ***
## `myCategory.fctr#Multimedia#` 0.000397 ***
## `myCategory.fctrCulture#Arts#` NA
## `myCategory.fctrBusiness#Business Day#Dealbook` NA
## myCategory.fctrmyOther 0.993936
## `myCategory.fctrBusiness#Technology#` NA
## `myCategory.fctrBusiness#Crosswords/Games#` 0.677317
## `myCategory.fctrTStyle##` 9.68e-05 ***
## `myCategory.fctrForeign#World#` 0.990830
## `myCategory.fctrOpEd#Opinion#` NA
## `myCategory.fctrStyles##Fashion` 0.988258
## `myCategory.fctr#Opinion#Room For Debate` 5.92e-07 ***
## `myCategory.fctr#U.S.#Education` 0.979774
## `myCategory.fctr##` NA
## `myCategory.fctrMetro#N.Y. / Region#` 0.021252 *
## `myCategory.fctrBusiness#Business Day#Small Business` 0.000204 ***
## `myCategory.fctrStyles#U.S.#` NA
## `myCategory.fctrTravel#Travel#` 0.001787 **
## `myCategory.fctr#Opinion#The Public Editor` NA
## S.T.one 0.148292
## H.P.s.notebook 0.998409
## H.T.take 0.421198
## A.npnct16.log 0.621389
## S.npnct16.log NA
## A.T.presid 0.997530
## S.T.presid 0.997532
## S.npnct08.log 0.123466
## A.npnct08.log NA
## PubDate.last100.log 0.691411
## .rnorm 0.296258
## H.npnct05.log 0.997979
## H.P.friday.night.music 0.082374 .
## H.T.say 0.311011
## H.T.obama 0.751110
## H.T.bank 0.902043
## `PubDate.date.fctr(7,13]` 0.900034
## `PubDate.date.fctr(13,19]` 0.584051
## `PubDate.date.fctr(19,25]` 0.582937
## `PubDate.date.fctr(25,31]` 0.699823
## `PubDate.second.fctr(14.8,29.5]` 0.637436
## `PubDate.second.fctr(29.5,44.2]` 0.914986
## `PubDate.second.fctr(44.2,59.1]` 0.252627
## H.npnct07.log 0.153483
## S.npnct07.log 0.998181
## S.npnct03.log 0.997285
## A.npnct18.log 0.997438
## H.npnct12.log 0.248099
## H.T.word 0.013995 *
## H.T.big 0.599130
## S.P.year.colon 0.997827
## S.T.obama 0.562423
## S.npnct20.log 0.997273
## H.npnct02.log 0.996988
## H.T.test 0.827055
## S.npnct14.log 0.602484
## H.P.on.this.day 0.997938
## S.P.first.draft 0.996969
## S.T.take 0.249852
## S.npnct06.log 0.647953
## S.T.time 0.247326
## H.T.newyorktim 0.879741
## H.npnct13.log 0.250931
## H.T.deal 0.992782
## S.T.new 0.983846
## H.T.billion 0.924099
## S.P.metropolitan.diary.colon 0.027489 *
## H.T.polit 0.227670
## H.P.verbatim.colon 0.996888
## H.T.china 0.362082
## H.T.art 0.250330
## `PubDate.minute.fctr(14.8,29.5]` 0.310315
## `PubDate.minute.fctr(29.5,44.2]` 0.217656
## `PubDate.minute.fctr(44.2,59.1]` 0.756201
## H.T.read 0.018459 *
## S.npnct12.log 0.461637
## A.T.year 0.777015
## A.T.will 0.267592
## S.T.appear 0.652865
## PubDate.wkday.fctr1 0.593901
## PubDate.wkday.fctr2 0.158574
## PubDate.wkday.fctr3 0.461250
## PubDate.wkday.fctr4 0.257217
## PubDate.wkday.fctr5 0.363265
## PubDate.wkday.fctr6 0.052471 .
## H.T.pictur 0.838754
## H.T.new 0.282327
## A.T.senat 0.384734
## S.T.show 0.275138
## H.P.today.in.smallbusiness 0.995767
## S.T.day 0.396137
## H.P.first.draft 0.994154
## S.npnct28.log 0.994256
## H.P.daily.clip.report 0.995230
## H.T.clip NA
## S.P.daily.clip.report NA
## A.T.first 0.320368
## H.T.news 0.419821
## H.T.X2014 0.550319
## A.T.newyork 0.033102 *
## A.T.report 0.177163
## A.T.compani 0.504928
## A.T.word 0.532127
## H.T.busi 0.490820
## A.T.newyorktim 0.085877 .
## A.npnct13.log 0.000809 ***
## S.T.share 0.122628
## A.T.articl 0.187436
## H.T.newyork 0.118363
## H.T.springsumm 0.991176
## H.T.day 0.532089
## S.T.diari 0.013190 *
## H.T.report 0.164212
## S.npnct04.log 0.112995
## S.T.herald 0.990396
## S.npnct15.log 0.888041
## H.T.week 0.256322
## A.T.photo 0.347719
## A.T.intern 0.246855
## S.T.tribun 0.991576
## S.P.fashion.week 0.998326
## S.T.archiv 0.989677
## H.P.fashion.week 0.988088
## H.npnct15.log 5.24e-05 ***
## A.T.fashion 0.984186
## A.T.week 0.838085
## H.nstopwrds.log 0.057494 .
## H.npnct28.log 0.412179
## S.npnct11.log 0.523182
## S.nstopwrds.log 0.199052
## H.ndgts.log 0.022788 *
## S.ndgts.log 0.306350
## H.nuppr.log 0.033943 *
## H.nwrds.log 0.717309
## S.nwrds.log 0.858556
## A.nchrs.log 0.795038
## A.nwrds.unq.log 0.164418
## S.nuppr.log 0.003509 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4042.7 on 4474 degrees of freedom
## Residual deviance: 1623.0 on 4242 degrees of freedom
## AIC: 2089
##
## Number of Fisher Scoring iterations: 19
## [1] TRUE
# From here to save(), this should all be in one function, because
# these steps are executed in the same sequence twice more, in the
# fit.data.training & predict.data.new chunks
# Add predictions from the model stored under `mdl_id` to `df`.
#
# Args:
#   df: data frame to score; the column named by glb_rsp_var is read in the
#       regression and binomial-classification branches.
#   mdl_id: name of the model in glb_models_lst; for binomial classification
#       it is also matched against glb_models_df$model_id.
#   rsp_var_out: prefix of the prediction column; mdl_id is appended to it.
#   prob_threshold_def: fallback probability cutoff, used when glb_models_df
#       has no usable "opt.prob.threshold.OOB" value for mdl_id.
#
# Returns:
#   df with the column <rsp_var_out><mdl_id> added, plus
#   <rsp_var_out><mdl_id>.err (regression) or
#   <rsp_var_out><mdl_id>.prob (binomial classification).
#
# Reads globals: glb_models_lst, glb_models_df, glb_rsp_var,
#   glb_is_regression, glb_is_classification, glb_is_binomial.
# Stops if a default threshold is needed but prob_threshold_def is NULL.
glb_get_predictions <- function(df, mdl_id, rsp_var_out, prob_threshold_def=NULL) {
    mdl <- glb_models_lst[[mdl_id]]
    rsp_var_out <- paste0(rsp_var_out, mdl_id)

    if (glb_is_regression) {
        df[, rsp_var_out] <- predict(mdl, newdata=df, type="raw")
        print(myplot_scatter(df, glb_rsp_var, rsp_var_out, smooth=TRUE))
        err_var <- paste0(rsp_var_out, ".err")
        df[, err_var] <- abs(df[, rsp_var_out] - df[, glb_rsp_var])
        # Sort on the error column created just above (it carries the mdl_id
        #   suffix); the global prefix alone names a column that does not exist
        print(head(orderBy(reformulate(c("-", err_var)), df)))
    }

    if (glb_is_classification && glb_is_binomial) {
        prob_threshold <- glb_models_df[glb_models_df$model_id == mdl_id,
                                        "opt.prob.threshold.OOB"]
        # length() == 0 covers NULL as well as "no row matched mdl_id";
        #   testing only the first element keeps the condition scalar for ||
        if (length(prob_threshold) == 0 || is.na(prob_threshold[1])) {
            warning("Using default probability threshold: ", prob_threshold_def)
            if (is.null(prob_threshold <- prob_threshold_def))
                stop("Default probability threshold is NULL")
        }

        prob_var <- paste0(rsp_var_out, ".prob")
        # Column 2 of the "prob" prediction is used as the probability of the
        #   second level of the response factor
        df[, prob_var] <- predict(mdl, newdata=df, type="prob")[, 2]
        # (prob >= threshold) * 1 + 1 maps FALSE/TRUE to level index 1/2
        df[, rsp_var_out] <-
            factor(levels(df[, glb_rsp_var])[
                       (df[, prob_var] >= prob_threshold) * 1 + 1],
                   levels(df[, glb_rsp_var]))

        # prediction stats already reported by myfit_mdl ???
    }

    if (glb_is_classification && !glb_is_binomial) {
        df[, rsp_var_out] <- predict(mdl, newdata=df, type="raw")
    }

    return(df)
}
# Score the OOB observations with the selected model
glb_OOBobs_df <- glb_get_predictions(df = glb_OOBobs_df,
                                     mdl_id = glb_sel_mdl_id,
                                     rsp_var_out = glb_rsp_var_out)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
# Flag each OOB observation whose selected-model prediction equals the
#   actual response
predct_accurate_var_name <-
    paste0(glb_rsp_var_out, glb_sel_mdl_id, ".accurate")
glb_OOBobs_df[, predct_accurate_var_name] <- (
    glb_OOBobs_df[, glb_rsp_var] ==
        glb_OOBobs_df[, paste0(glb_rsp_var_out, glb_sel_mdl_id)]
)

# Refresh the features table via mymerge_feats_importance (presumably merges
#   the selected model's variable importance -- confirm in mydsutils.R), then
#   keep a copy of the `importance` column under a model-specific name
glb_feats_df <- mymerge_feats_importance(feats_df=glb_feats_df,
                                         sel_mdl=glb_sel_mdl,
                                         glb_fitobs_df)
glb_feats_df[, paste0(glb_sel_mdl_id, ".importance")] <-
    glb_feats_df$importance
print(glb_feats_df)
## id cor.y
## WordCount.log WordCount.log 2.656836e-01
## H.P.readers.respond H.P.readers.respond 4.432886e-02
## myCategory.fctr myCategory.fctr 1.234541e-02
## H.npnct19.log H.npnct19.log 1.283641e-01
## H.npnct15.log H.npnct15.log -8.273237e-02
## .clusterid.fctr .clusterid.fctr 1.813987e-01
## A.npnct13.log A.npnct13.log -4.999563e-02
## A.npnct19.log A.npnct19.log 5.482747e-02
## S.nuppr.log S.nuppr.log -2.718459e-01
## S.T.diari S.T.diari -6.229931e-02
## H.T.word H.T.word -1.382927e-02
## H.npnct08.log H.npnct08.log 5.375262e-02
## H.T.read H.T.read -3.467043e-02
## H.ndgts.log H.ndgts.log -1.196633e-01
## S.P.metropolitan.diary.colon S.P.metropolitan.diary.colon -2.841404e-02
## S.ratio.sum.TfIdf.nwrds S.ratio.sum.TfIdf.nwrds 2.622549e-01
## A.T.newyork A.T.newyork -4.686921e-02
## H.nuppr.log H.nuppr.log -1.278085e-01
## S.T.make S.T.make 4.118050e-02
## PubDate.wkday.fctr PubDate.wkday.fctr -3.980129e-02
## H.nstopwrds.log H.nstopwrds.log -8.657067e-02
## H.ratio.nstopwrds.nwrds H.ratio.nstopwrds.nwrds 4.024406e-02
## H.npnct11.log H.npnct11.log 1.333613e-02
## S.T.can S.T.can 3.005998e-02
## H.P.no.comment.colon H.P.no.comment.colon 6.074669e-02
## H.P.friday.night.music H.P.friday.night.music -9.653967e-03
## A.T.newyorktim A.T.newyorktim -4.984782e-02
## S.npnct04.log S.npnct04.log -6.294642e-02
## H.T.newyork H.T.newyork -5.564999e-02
## S.T.share S.T.share -5.105597e-02
## S.npnct08.log S.npnct08.log -3.372706e-03
## H.sum.TfIdf H.sum.TfIdf 1.520414e-01
## H.P.recap.colon H.P.recap.colon 9.008096e-02
## S.T.one S.T.one 1.050293e-02
## H.npnct07.log H.npnct07.log -1.201741e-02
## PubDate.last10.log PubDate.last10.log 4.931702e-02
## H.T.report H.T.report -6.238114e-02
## A.nwrds.unq.log A.nwrds.unq.log -2.460117e-01
## A.T.report A.T.report -4.774593e-02
## PubDate.hour.fctr PubDate.hour.fctr 1.354368e-01
## A.T.articl A.T.articl -5.470831e-02
## A.sum.TfIdf A.sum.TfIdf 1.478461e-01
## S.nstopwrds.log S.nstopwrds.log -1.148150e-01
## PubDate.minute.fctr PubDate.minute.fctr -3.407385e-02
## H.T.polit H.T.polit -3.058564e-02
## S.ratio.nstopwrds.nwrds S.ratio.nstopwrds.nwrds 1.206896e-01
## A.T.intern A.T.intern -6.953025e-02
## S.T.time S.T.time -2.416246e-02
## H.npnct12.log H.npnct12.log -1.305305e-02
## S.T.take S.T.take -2.264447e-02
## H.T.art H.T.art -3.291486e-02
## H.npnct13.log H.npnct13.log -2.524770e-02
## PubDate.second.fctr PubDate.second.fctr -1.187946e-02
## H.T.week H.T.week -6.827601e-02
## H.T.get H.T.get 3.300192e-02
## S.npnct01.log S.npnct01.log 3.093101e-02
## A.T.will A.T.will -3.884318e-02
## S.T.show S.T.show -4.182920e-02
## H.T.new H.T.new -4.111696e-02
## .rnorm .rnorm -8.244230e-03
## H.ratio.sum.TfIdf.nwrds H.ratio.sum.TfIdf.nwrds 2.254527e-01
## S.ndgts.log S.ndgts.log -1.242046e-01
## H.T.say H.T.say -9.763205e-03
## A.T.first A.T.first -4.433630e-02
## A.T.photo A.T.photo -6.873838e-02
## H.T.china H.T.china -3.283653e-02
## H.npnct01.log H.npnct01.log 2.271577e-02
## H.T.make H.T.make 1.349595e-02
## A.T.senat A.T.senat -4.139980e-02
## S.T.said S.T.said 1.863436e-02
## S.T.day S.T.day -4.262213e-02
## H.npnct28.log H.npnct28.log -8.917338e-02
## H.T.news H.T.news -4.436368e-02
## H.npnct16.log H.npnct16.log 3.039622e-02
## H.T.take H.T.take -1.263270e-03
## S.npnct12.log S.npnct12.log -3.638891e-02
## H.T.busi H.T.busi -4.899819e-02
## A.T.compani A.T.compani -4.774812e-02
## S.npnct11.log S.npnct11.log -9.158156e-02
## H.T.day H.T.day -6.044381e-02
## A.T.word A.T.word -4.821561e-02
## H.P.facts.figures H.P.facts.figures 5.410097e-02
## H.T.X2014 H.T.X2014 -4.523858e-02
## PubDate.last1.log PubDate.last1.log 4.635751e-02
## S.T.obama S.T.obama -1.914281e-02
## PubDate.date.fctr PubDate.date.fctr -1.164756e-02
## H.T.big H.T.big -1.438162e-02
## S.npnct14.log S.npnct14.log -2.121844e-02
## A.npnct16.log A.npnct16.log -1.587454e-03
## S.npnct06.log S.npnct06.log -2.389145e-02
## S.T.appear S.T.appear -3.941362e-02
## PubDate.last100.log PubDate.last100.log -7.663322e-03
## PubDate.wkend PubDate.wkend 1.067288e-01
## H.T.ebola H.T.ebola 2.682920e-02
## H.nwrds.log H.nwrds.log -1.573431e-01
## H.T.obama H.T.obama -9.878461e-03
## A.T.year A.T.year -3.741571e-02
## A.nchrs.log A.nchrs.log -2.245488e-01
## H.T.test H.T.test -2.117852e-02
## A.T.week A.T.week -8.542792e-02
## H.T.pictur H.T.pictur -4.003882e-02
## S.nwrds.log S.nwrds.log -1.978341e-01
## H.T.newyorktim H.T.newyorktim -2.514415e-02
## S.npnct15.log S.npnct15.log -6.770952e-02
## H.T.bank H.T.bank -1.037439e-02
## H.T.billion H.T.billion -2.776561e-02
## S.T.new S.T.new -2.592872e-02
## A.T.fashion A.T.fashion -8.416793e-02
## H.P.fashion.week H.P.fashion.week -7.632046e-02
## S.T.archiv S.T.archiv -7.202808e-02
## S.T.herald S.T.herald -6.752419e-02
## H.T.springsumm H.T.springsumm -5.943248e-02
## S.T.tribun S.T.tribun -7.013418e-02
## H.T.deal H.T.deal -2.556237e-02
## H.P.first.draft H.P.first.draft -4.316253e-02
## S.npnct28.log S.npnct28.log -4.370037e-02
## H.P.daily.clip.report H.P.daily.clip.report -4.388279e-02
## H.P.today.in.smallbusiness H.P.today.in.smallbusiness -4.243051e-02
## H.P.verbatim.colon H.P.verbatim.colon -3.194363e-02
## S.P.first.draft S.P.first.draft -2.150663e-02
## H.npnct02.log H.npnct02.log -2.001851e-02
## H.P.quandary H.P.quandary 8.734922e-02
## S.npnct20.log S.npnct20.log -1.923169e-02
## S.npnct03.log S.npnct03.log -1.240734e-02
## A.npnct18.log A.npnct18.log -1.271661e-02
## A.T.presid A.T.presid -2.090565e-03
## S.T.presid S.T.presid -2.381159e-03
## S.P.year.colon S.P.year.colon -1.755336e-02
## H.P.on.this.day H.P.on.this.day -2.150663e-02
## H.npnct05.log H.npnct05.log -9.653967e-03
## S.npnct07.log S.npnct07.log -1.214357e-02
## S.P.fashion.week S.P.fashion.week -7.080716e-02
## H.P.s.notebook H.P.s.notebook 7.755542e-03
## .clusterid .clusterid 1.820567e-01
## A.ndgts.log A.ndgts.log -1.249484e-01
## A.npnct01.log A.npnct01.log 3.093101e-02
## A.npnct02.log A.npnct02.log -1.451467e-02
## A.npnct03.log A.npnct03.log -1.359260e-02
## A.npnct04.log A.npnct04.log -6.294642e-02
## A.npnct05.log A.npnct05.log NA
## A.npnct06.log A.npnct06.log -2.389145e-02
## A.npnct07.log A.npnct07.log -1.214357e-02
## A.npnct08.log A.npnct08.log -4.193476e-03
## A.npnct09.log A.npnct09.log NA
## A.npnct10.log A.npnct10.log -5.547032e-03
## A.npnct11.log A.npnct11.log -9.183870e-02
## A.npnct12.log A.npnct12.log -3.760012e-02
## A.npnct14.log A.npnct14.log -2.407715e-02
## A.npnct15.log A.npnct15.log -6.893301e-02
## A.npnct17.log A.npnct17.log -1.457558e-02
## A.npnct20.log A.npnct20.log -1.923169e-02
## A.npnct21.log A.npnct21.log 1.537569e-02
## A.npnct22.log A.npnct22.log NA
## A.npnct23.log A.npnct23.log 1.537569e-02
## A.npnct24.log A.npnct24.log -9.890046e-19
## A.npnct25.log A.npnct25.log -5.547032e-03
## A.npnct26.log A.npnct26.log NA
## A.npnct27.log A.npnct27.log NA
## A.npnct28.log A.npnct28.log -4.373349e-02
## A.npnct29.log A.npnct29.log NA
## A.npnct30.log A.npnct30.log NA
## A.nstopwrds.log A.nstopwrds.log -1.153879e-01
## A.nuppr.log A.nuppr.log -2.720962e-01
## A.nwrds.log A.nwrds.log -1.978712e-01
## A.P.daily.clip.report A.P.daily.clip.report -4.388279e-02
## A.P.fashion.week A.P.fashion.week -7.080716e-02
## A.P.first.draft A.P.first.draft -2.150663e-02
## A.P.http A.P.http -1.294748e-02
## A.P.metropolitan.diary.colon A.P.metropolitan.diary.colon -2.841404e-02
## A.P.year.colon A.P.year.colon -1.755336e-02
## A.ratio.nstopwrds.nwrds A.ratio.nstopwrds.nwrds 1.213545e-01
## A.ratio.sum.TfIdf.nwrds A.ratio.sum.TfIdf.nwrds 2.623865e-01
## A.T.appear A.T.appear -3.941362e-02
## A.T.archiv A.T.archiv -7.202808e-02
## A.T.can A.T.can 3.083389e-02
## A.T.day A.T.day -4.270831e-02
## A.T.diari A.T.diari -6.229931e-02
## A.T.herald A.T.herald -6.752419e-02
## A.T.make A.T.make 4.124187e-02
## A.T.new A.T.new -2.597887e-02
## A.T.obama A.T.obama -1.914924e-02
## A.T.one A.T.one 1.051414e-02
## A.T.said A.T.said 1.876762e-02
## A.T.share A.T.share -5.105597e-02
## A.T.show A.T.show -4.185292e-02
## A.T.take A.T.take -2.271897e-02
## A.T.time A.T.time -2.430509e-02
## A.T.tribun A.T.tribun -7.013418e-02
## H.nchrs.log H.nchrs.log -1.710624e-01
## H.npnct03.log H.npnct03.log 9.533020e-03
## H.npnct04.log H.npnct04.log -5.126277e-02
## H.npnct06.log H.npnct06.log 3.190718e-02
## H.npnct09.log H.npnct09.log NA
## H.npnct10.log H.npnct10.log -5.547032e-03
## H.npnct14.log H.npnct14.log -6.158577e-02
## H.npnct17.log H.npnct17.log NA
## H.npnct18.log H.npnct18.log NA
## H.npnct20.log H.npnct20.log -5.547032e-03
## H.npnct21.log H.npnct21.log NA
## H.npnct22.log H.npnct22.log NA
## H.npnct23.log H.npnct23.log NA
## H.npnct24.log H.npnct24.log -9.890046e-19
## H.npnct25.log H.npnct25.log NA
## H.npnct26.log H.npnct26.log NA
## H.npnct27.log H.npnct27.log NA
## H.npnct29.log H.npnct29.log NA
## H.npnct30.log H.npnct30.log NA
## H.nwrds.unq.log H.nwrds.unq.log -2.014127e-01
## H.P.http H.P.http NA
## H.P.today.in.politic H.P.today.in.politic -3.733661e-02
## H.P.what.we.are H.P.what.we.are -3.775209e-02
## H.P.year.colon H.P.year.colon -7.842875e-02
## H.T.clip H.T.clip -4.388279e-02
## H.T.daili H.T.daili -6.303731e-02
## H.T.fashion H.T.fashion -7.947505e-02
## H.T.first H.T.first -4.472902e-02
## H.T.morn H.T.morn -4.838380e-02
## H.T.today H.T.today -5.833786e-02
## H.T.X2015 H.T.X2015 -6.601141e-02
## Popular Popular 1.000000e+00
## Popular.fctr Popular.fctr NA
## PubDate.last1 PubDate.last1 3.592267e-02
## PubDate.last10 PubDate.last10 5.398093e-02
## PubDate.last100 PubDate.last100 3.989229e-02
## PubDate.month.fctr PubDate.month.fctr 1.914874e-02
## PubDate.POSIX PubDate.POSIX 1.568326e-02
## PubDate.year.fctr PubDate.year.fctr NA
## PubDate.zoo PubDate.zoo 1.568326e-02
## S.nchrs.log S.nchrs.log -2.246930e-01
## S.npnct02.log S.npnct02.log -5.547032e-03
## S.npnct05.log S.npnct05.log NA
## S.npnct09.log S.npnct09.log NA
## S.npnct10.log S.npnct10.log -5.547032e-03
## S.npnct13.log S.npnct13.log -5.332519e-02
## S.npnct16.log S.npnct16.log -1.587454e-03
## S.npnct17.log S.npnct17.log NA
## S.npnct18.log S.npnct18.log NA
## S.npnct19.log S.npnct19.log 5.503894e-02
## S.npnct21.log S.npnct21.log 2.760321e-02
## S.npnct22.log S.npnct22.log NA
## S.npnct23.log S.npnct23.log 2.760321e-02
## S.npnct24.log S.npnct24.log -9.890046e-19
## S.npnct25.log S.npnct25.log NA
## S.npnct26.log S.npnct26.log NA
## S.npnct27.log S.npnct27.log NA
## S.npnct29.log S.npnct29.log NA
## S.npnct30.log S.npnct30.log NA
## S.nwrds.unq.log S.nwrds.unq.log -2.461670e-01
## S.P.daily.clip.report S.P.daily.clip.report -4.388279e-02
## S.P.http S.P.http NA
## S.sum.TfIdf S.sum.TfIdf 1.484963e-01
## S.T.articl S.T.articl -5.471737e-02
## S.T.compani S.T.compani -4.787994e-02
## S.T.fashion S.T.fashion -8.417159e-02
## S.T.first S.T.first -4.447317e-02
## S.T.intern S.T.intern -6.956906e-02
## S.T.newyork S.T.newyork -4.694998e-02
## S.T.newyorktim S.T.newyorktim -4.985328e-02
## S.T.photo S.T.photo -6.874283e-02
## S.T.report S.T.report -4.779877e-02
## S.T.senat S.T.senat -4.143422e-02
## S.T.week S.T.week -8.552704e-02
## S.T.will S.T.will -3.888838e-02
## S.T.word S.T.word -4.822452e-02
## S.T.year S.T.year -3.756011e-02
## UniqueID UniqueID 1.182492e-02
## WordCount WordCount 2.575265e-01
## exclude.as.feat cor.y.abs
## WordCount.log FALSE 2.656836e-01
## H.P.readers.respond FALSE 4.432886e-02
## myCategory.fctr FALSE 1.234541e-02
## H.npnct19.log FALSE 1.283641e-01
## H.npnct15.log FALSE 8.273237e-02
## .clusterid.fctr FALSE 1.813987e-01
## A.npnct13.log FALSE 4.999563e-02
## A.npnct19.log FALSE 5.482747e-02
## S.nuppr.log FALSE 2.718459e-01
## S.T.diari FALSE 6.229931e-02
## H.T.word FALSE 1.382927e-02
## H.npnct08.log FALSE 5.375262e-02
## H.T.read FALSE 3.467043e-02
## H.ndgts.log FALSE 1.196633e-01
## S.P.metropolitan.diary.colon FALSE 2.841404e-02
## S.ratio.sum.TfIdf.nwrds FALSE 2.622549e-01
## A.T.newyork FALSE 4.686921e-02
## H.nuppr.log FALSE 1.278085e-01
## S.T.make FALSE 4.118050e-02
## PubDate.wkday.fctr FALSE 3.980129e-02
## H.nstopwrds.log FALSE 8.657067e-02
## H.ratio.nstopwrds.nwrds FALSE 4.024406e-02
## H.npnct11.log FALSE 1.333613e-02
## S.T.can FALSE 3.005998e-02
## H.P.no.comment.colon FALSE 6.074669e-02
## H.P.friday.night.music FALSE 9.653967e-03
## A.T.newyorktim FALSE 4.984782e-02
## S.npnct04.log FALSE 6.294642e-02
## H.T.newyork FALSE 5.564999e-02
## S.T.share FALSE 5.105597e-02
## S.npnct08.log FALSE 3.372706e-03
## H.sum.TfIdf FALSE 1.520414e-01
## H.P.recap.colon FALSE 9.008096e-02
## S.T.one FALSE 1.050293e-02
## H.npnct07.log FALSE 1.201741e-02
## PubDate.last10.log FALSE 4.931702e-02
## H.T.report FALSE 6.238114e-02
## A.nwrds.unq.log FALSE 2.460117e-01
## A.T.report FALSE 4.774593e-02
## PubDate.hour.fctr FALSE 1.354368e-01
## A.T.articl FALSE 5.470831e-02
## A.sum.TfIdf FALSE 1.478461e-01
## S.nstopwrds.log FALSE 1.148150e-01
## PubDate.minute.fctr FALSE 3.407385e-02
## H.T.polit FALSE 3.058564e-02
## S.ratio.nstopwrds.nwrds FALSE 1.206896e-01
## A.T.intern FALSE 6.953025e-02
## S.T.time FALSE 2.416246e-02
## H.npnct12.log FALSE 1.305305e-02
## S.T.take FALSE 2.264447e-02
## H.T.art FALSE 3.291486e-02
## H.npnct13.log FALSE 2.524770e-02
## PubDate.second.fctr FALSE 1.187946e-02
## H.T.week FALSE 6.827601e-02
## H.T.get FALSE 3.300192e-02
## S.npnct01.log FALSE 3.093101e-02
## A.T.will FALSE 3.884318e-02
## S.T.show FALSE 4.182920e-02
## H.T.new FALSE 4.111696e-02
## .rnorm FALSE 8.244230e-03
## H.ratio.sum.TfIdf.nwrds FALSE 2.254527e-01
## S.ndgts.log FALSE 1.242046e-01
## H.T.say FALSE 9.763205e-03
## A.T.first FALSE 4.433630e-02
## A.T.photo FALSE 6.873838e-02
## H.T.china FALSE 3.283653e-02
## H.npnct01.log FALSE 2.271577e-02
## H.T.make FALSE 1.349595e-02
## A.T.senat FALSE 4.139980e-02
## S.T.said FALSE 1.863436e-02
## S.T.day FALSE 4.262213e-02
## H.npnct28.log FALSE 8.917338e-02
## H.T.news FALSE 4.436368e-02
## H.npnct16.log FALSE 3.039622e-02
## H.T.take FALSE 1.263270e-03
## S.npnct12.log FALSE 3.638891e-02
## H.T.busi FALSE 4.899819e-02
## A.T.compani FALSE 4.774812e-02
## S.npnct11.log FALSE 9.158156e-02
## H.T.day FALSE 6.044381e-02
## A.T.word FALSE 4.821561e-02
## H.P.facts.figures FALSE 5.410097e-02
## H.T.X2014 FALSE 4.523858e-02
## PubDate.last1.log FALSE 4.635751e-02
## S.T.obama FALSE 1.914281e-02
## PubDate.date.fctr FALSE 1.164756e-02
## H.T.big FALSE 1.438162e-02
## S.npnct14.log FALSE 2.121844e-02
## A.npnct16.log FALSE 1.587454e-03
## S.npnct06.log FALSE 2.389145e-02
## S.T.appear FALSE 3.941362e-02
## PubDate.last100.log FALSE 7.663322e-03
## PubDate.wkend FALSE 1.067288e-01
## H.T.ebola FALSE 2.682920e-02
## H.nwrds.log FALSE 1.573431e-01
## H.T.obama FALSE 9.878461e-03
## A.T.year FALSE 3.741571e-02
## A.nchrs.log FALSE 2.245488e-01
## H.T.test FALSE 2.117852e-02
## A.T.week FALSE 8.542792e-02
## H.T.pictur FALSE 4.003882e-02
## S.nwrds.log FALSE 1.978341e-01
## H.T.newyorktim FALSE 2.514415e-02
## S.npnct15.log FALSE 6.770952e-02
## H.T.bank FALSE 1.037439e-02
## H.T.billion FALSE 2.776561e-02
## S.T.new FALSE 2.592872e-02
## A.T.fashion FALSE 8.416793e-02
## H.P.fashion.week FALSE 7.632046e-02
## S.T.archiv FALSE 7.202808e-02
## S.T.herald FALSE 6.752419e-02
## H.T.springsumm FALSE 5.943248e-02
## S.T.tribun FALSE 7.013418e-02
## H.T.deal FALSE 2.556237e-02
## H.P.first.draft FALSE 4.316253e-02
## S.npnct28.log FALSE 4.370037e-02
## H.P.daily.clip.report FALSE 4.388279e-02
## H.P.today.in.smallbusiness FALSE 4.243051e-02
## H.P.verbatim.colon FALSE 3.194363e-02
## S.P.first.draft FALSE 2.150663e-02
## H.npnct02.log FALSE 2.001851e-02
## H.P.quandary FALSE 8.734922e-02
## S.npnct20.log FALSE 1.923169e-02
## S.npnct03.log FALSE 1.240734e-02
## A.npnct18.log FALSE 1.271661e-02
## A.T.presid FALSE 2.090565e-03
## S.T.presid FALSE 2.381159e-03
## S.P.year.colon FALSE 1.755336e-02
## H.P.on.this.day FALSE 2.150663e-02
## H.npnct05.log FALSE 9.653967e-03
## S.npnct07.log FALSE 1.214357e-02
## S.P.fashion.week FALSE 7.080716e-02
## H.P.s.notebook FALSE 7.755542e-03
## .clusterid TRUE 1.820567e-01
## A.ndgts.log FALSE 1.249484e-01
## A.npnct01.log FALSE 3.093101e-02
## A.npnct02.log FALSE 1.451467e-02
## A.npnct03.log FALSE 1.359260e-02
## A.npnct04.log FALSE 6.294642e-02
## A.npnct05.log FALSE NA
## A.npnct06.log FALSE 2.389145e-02
## A.npnct07.log FALSE 1.214357e-02
## A.npnct08.log FALSE 4.193476e-03
## A.npnct09.log FALSE NA
## A.npnct10.log FALSE 5.547032e-03
## A.npnct11.log FALSE 9.183870e-02
## A.npnct12.log FALSE 3.760012e-02
## A.npnct14.log FALSE 2.407715e-02
## A.npnct15.log FALSE 6.893301e-02
## A.npnct17.log FALSE 1.457558e-02
## A.npnct20.log FALSE 1.923169e-02
## A.npnct21.log FALSE 1.537569e-02
## A.npnct22.log FALSE NA
## A.npnct23.log FALSE 1.537569e-02
## A.npnct24.log FALSE 9.890046e-19
## A.npnct25.log FALSE 5.547032e-03
## A.npnct26.log FALSE NA
## A.npnct27.log FALSE NA
## A.npnct28.log FALSE 4.373349e-02
## A.npnct29.log FALSE NA
## A.npnct30.log FALSE NA
## A.nstopwrds.log FALSE 1.153879e-01
## A.nuppr.log FALSE 2.720962e-01
## A.nwrds.log FALSE 1.978712e-01
## A.P.daily.clip.report FALSE 4.388279e-02
## A.P.fashion.week FALSE 7.080716e-02
## A.P.first.draft FALSE 2.150663e-02
## A.P.http FALSE 1.294748e-02
## A.P.metropolitan.diary.colon FALSE 2.841404e-02
## A.P.year.colon FALSE 1.755336e-02
## A.ratio.nstopwrds.nwrds FALSE 1.213545e-01
## A.ratio.sum.TfIdf.nwrds FALSE 2.623865e-01
## A.T.appear FALSE 3.941362e-02
## A.T.archiv FALSE 7.202808e-02
## A.T.can FALSE 3.083389e-02
## A.T.day FALSE 4.270831e-02
## A.T.diari FALSE 6.229931e-02
## A.T.herald FALSE 6.752419e-02
## A.T.make FALSE 4.124187e-02
## A.T.new FALSE 2.597887e-02
## A.T.obama FALSE 1.914924e-02
## A.T.one FALSE 1.051414e-02
## A.T.said FALSE 1.876762e-02
## A.T.share FALSE 5.105597e-02
## A.T.show FALSE 4.185292e-02
## A.T.take FALSE 2.271897e-02
## A.T.time FALSE 2.430509e-02
## A.T.tribun FALSE 7.013418e-02
## H.nchrs.log FALSE 1.710624e-01
## H.npnct03.log FALSE 9.533020e-03
## H.npnct04.log FALSE 5.126277e-02
## H.npnct06.log FALSE 3.190718e-02
## H.npnct09.log FALSE NA
## H.npnct10.log FALSE 5.547032e-03
## H.npnct14.log FALSE 6.158577e-02
## H.npnct17.log FALSE NA
## H.npnct18.log FALSE NA
## H.npnct20.log FALSE 5.547032e-03
## H.npnct21.log FALSE NA
## H.npnct22.log FALSE NA
## H.npnct23.log FALSE NA
## H.npnct24.log FALSE 9.890046e-19
## H.npnct25.log FALSE NA
## H.npnct26.log FALSE NA
## H.npnct27.log FALSE NA
## H.npnct29.log FALSE NA
## H.npnct30.log FALSE NA
## H.nwrds.unq.log FALSE 2.014127e-01
## H.P.http FALSE NA
## H.P.today.in.politic FALSE 3.733661e-02
## H.P.what.we.are FALSE 3.775209e-02
## H.P.year.colon FALSE 7.842875e-02
## H.T.clip FALSE 4.388279e-02
## H.T.daili FALSE 6.303731e-02
## H.T.fashion FALSE 7.947505e-02
## H.T.first FALSE 4.472902e-02
## H.T.morn FALSE 4.838380e-02
## H.T.today FALSE 5.833786e-02
## H.T.X2015 FALSE 6.601141e-02
## Popular TRUE 1.000000e+00
## Popular.fctr TRUE NA
## PubDate.last1 TRUE 3.592267e-02
## PubDate.last10 TRUE 5.398093e-02
## PubDate.last100 TRUE 3.989229e-02
## PubDate.month.fctr TRUE 1.914874e-02
## PubDate.POSIX TRUE 1.568326e-02
## PubDate.year.fctr FALSE NA
## PubDate.zoo TRUE 1.568326e-02
## S.nchrs.log FALSE 2.246930e-01
## S.npnct02.log FALSE 5.547032e-03
## S.npnct05.log FALSE NA
## S.npnct09.log FALSE NA
## S.npnct10.log FALSE 5.547032e-03
## S.npnct13.log FALSE 5.332519e-02
## S.npnct16.log FALSE 1.587454e-03
## S.npnct17.log FALSE NA
## S.npnct18.log FALSE NA
## S.npnct19.log FALSE 5.503894e-02
## S.npnct21.log FALSE 2.760321e-02
## S.npnct22.log FALSE NA
## S.npnct23.log FALSE 2.760321e-02
## S.npnct24.log FALSE 9.890046e-19
## S.npnct25.log FALSE NA
## S.npnct26.log FALSE NA
## S.npnct27.log FALSE NA
## S.npnct29.log FALSE NA
## S.npnct30.log FALSE NA
## S.nwrds.unq.log FALSE 2.461670e-01
## S.P.daily.clip.report FALSE 4.388279e-02
## S.P.http FALSE NA
## S.sum.TfIdf FALSE 1.484963e-01
## S.T.articl FALSE 5.471737e-02
## S.T.compani FALSE 4.787994e-02
## S.T.fashion FALSE 8.417159e-02
## S.T.first FALSE 4.447317e-02
## S.T.intern FALSE 6.956906e-02
## S.T.newyork FALSE 4.694998e-02
## S.T.newyorktim FALSE 4.985328e-02
## S.T.photo FALSE 6.874283e-02
## S.T.report FALSE 4.779877e-02
## S.T.senat FALSE 4.143422e-02
## S.T.week FALSE 8.552704e-02
## S.T.will FALSE 3.888838e-02
## S.T.word FALSE 4.822452e-02
## S.T.year FALSE 3.756011e-02
## UniqueID TRUE 1.182492e-02
## WordCount TRUE 2.575265e-01
## cor.high.X freqRatio
## WordCount.log <NA> 1.300000
## H.P.readers.respond <NA> 342.789474
## myCategory.fctr <NA> 1.337185
## H.npnct19.log <NA> 14.995098
## H.npnct15.log <NA> 3.914910
## .clusterid.fctr <NA> 16.410959
## A.npnct13.log <NA> 4.603330
## A.npnct19.log <NA> 12.798715
## S.nuppr.log <NA> 1.152620
## S.T.diari <NA> 71.528090
## H.T.word <NA> 104.096774
## H.npnct08.log <NA> 111.620690
## H.T.read <NA> 179.388889
## H.ndgts.log <NA> 13.616137
## S.P.metropolitan.diary.colon <NA> 99.492308
## S.ratio.sum.TfIdf.nwrds <NA> 2.583333
## A.T.newyork <NA> 149.547619
## H.nuppr.log <NA> 1.033930
## S.T.make <NA> 273.782609
## PubDate.wkday.fctr <NA> 1.003268
## H.nstopwrds.log <NA> 1.370729
## H.ratio.nstopwrds.nwrds <NA> 1.141631
## H.npnct11.log <NA> 4.937442
## S.T.can <NA> 261.666667
## H.P.no.comment.colon <NA> 724.777778
## H.P.friday.night.music <NA> 543.333333
## A.T.newyorktim <NA> 84.540541
## S.npnct04.log <NA> 28.536364
## H.T.newyork <NA> 112.517857
## S.T.share <NA> 234.629630
## S.npnct08.log <NA> 175.486486
## H.sum.TfIdf <NA> 1.127273
## H.P.recap.colon <NA> 93.666667
## S.T.one <NA> 214.965517
## H.npnct07.log <NA> 5.437234
## PubDate.last10.log <NA> 1.666667
## H.T.report <NA> 102.000000
## A.nwrds.unq.log <NA> 1.054206
## A.T.report <NA> 80.371795
## PubDate.hour.fctr <NA> 1.835040
## A.T.articl <NA> 85.500000
## A.sum.TfIdf <NA> 2.583333
## S.nstopwrds.log <NA> 1.097879
## PubDate.minute.fctr <NA> 1.483365
## H.T.polit <NA> 128.780000
## S.ratio.nstopwrds.nwrds <NA> 1.908517
## A.T.intern <NA> 140.400000
## S.T.time <NA> 217.862069
## H.npnct12.log <NA> 13.126638
## S.T.take <NA> 274.608696
## H.T.art <NA> 293.363636
## H.npnct13.log <NA> 22.802326
## PubDate.second.fctr <NA> 1.018204
## H.T.week <NA> 71.352273
## H.T.get <NA> 430.866667
## S.npnct01.log <NA> 309.952381
## A.T.will <NA> 121.734694
## S.T.show <NA> 274.608696
## H.T.new <NA> 123.333333
## .rnorm <NA> 2.000000
## H.ratio.sum.TfIdf.nwrds <NA> 1.148148
## S.ndgts.log <NA> 10.511247
## H.T.say <NA> 247.461538
## A.T.first <NA> 225.250000
## A.T.photo <NA> 70.400000
## H.T.china <NA> 238.407407
## H.npnct01.log <NA> 282.913043
## H.T.make <NA> 322.200000
## A.T.senat <NA> 372.294118
## S.T.said <NA> 202.516129
## S.T.day <NA> 89.528571
## H.npnct28.log <NA> 24.123077
## H.T.news <NA> 322.000000
## H.npnct16.log <NA> 96.104478
## H.T.take <NA> 322.250000
## S.npnct12.log <NA> 5.706263
## H.T.busi <NA> 229.428571
## A.T.compani <NA> 137.111111
## S.npnct11.log <NA> 1.660473
## H.T.day <NA> 86.547945
## A.T.word <NA> 133.125000
## H.P.facts.figures <NA> 1087.666667
## H.T.X2014 <NA> 110.879310
## PubDate.last1.log <NA> 1.142857
## S.T.obama <NA> 398.625000
## PubDate.date.fctr <NA> 1.021394
## H.T.big <NA> 403.562500
## S.npnct14.log <NA> 203.062500
## A.npnct16.log <NA> 434.133333
## S.npnct06.log <NA> 115.642857
## S.T.appear <NA> 228.821429
## PubDate.last100.log <NA> 25.000000
## PubDate.wkend <NA> 9.095827
## H.T.ebola <NA> 293.000000
## H.nwrds.log <NA> 1.104308
## H.T.obama <NA> 229.750000
## A.T.year <NA> 160.815789
## A.nchrs.log <NA> 1.328571
## H.T.test <NA> 306.666667
## A.T.week <NA> 56.560748
## H.T.pictur <NA> 99.230769
## S.nwrds.log <NA> 1.049342
## H.T.newyorktim <NA> 433.266667
## S.npnct15.log <NA> 13.647191
## H.T.bank <NA> 214.300000
## H.T.billion <NA> 214.533333
## S.T.new <NA> 114.423077
## A.T.fashion <NA> 59.245283
## H.P.fashion.week <NA> 34.500000
## S.T.archiv <NA> 144.545455
## S.T.herald <NA> 144.750000
## H.T.springsumm <NA> 106.966667
## S.T.tribun <NA> 144.750000
## H.T.deal <NA> 230.428571
## H.P.first.draft <NA> 107.866667
## S.npnct28.log <NA> 134.791667
## H.P.daily.clip.report <NA> 104.354839
## H.P.today.in.smallbusiness <NA> 111.620690
## H.P.verbatim.colon <NA> 196.939394
## S.P.first.draft <NA> 434.466667
## H.npnct02.log <NA> 501.461538
## H.P.quandary <NA> 652.200000
## S.npnct20.log <NA> 543.333333
## S.npnct03.log <NA> 1305.400000
## A.npnct18.log <NA> 1631.500000
## A.T.presid <NA> 232.740741
## S.T.presid <NA> 232.740741
## S.P.year.colon <NA> 652.200000
## H.P.on.this.day <NA> 434.466667
## H.npnct05.log <NA> 543.333333
## S.npnct07.log <NA> 1631.750000
## S.P.fashion.week <NA> 40.081761
## H.P.s.notebook <NA> 815.500000
## .clusterid <NA> 16.410959
## A.ndgts.log S.ndgts.log 10.501022
## A.npnct01.log S.npnct01.log 309.952381
## A.npnct02.log A.P.http 1087.500000
## A.npnct03.log S.npnct03.log 1087.666667
## A.npnct04.log S.npnct04.log 28.536364
## A.npnct05.log <NA> 0.000000
## A.npnct06.log S.npnct06.log 115.642857
## A.npnct07.log S.npnct07.log 1631.750000
## A.npnct08.log <NA> 170.842105
## A.npnct09.log <NA> 0.000000
## A.npnct10.log <NA> 6531.000000
## A.npnct11.log S.npnct11.log 1.660473
## A.npnct12.log S.npnct12.log 5.715368
## A.npnct14.log A.npnct17.log 196.696970
## A.npnct15.log S.npnct15.log 13.482222
## A.npnct17.log A.npnct02.log 1087.500000
## A.npnct20.log S.npnct20.log 543.333333
## A.npnct21.log A.npnct23.log 3264.500000
## A.npnct22.log <NA> 0.000000
## A.npnct23.log <NA> 3264.500000
## A.npnct24.log <NA> 0.000000
## A.npnct25.log <NA> 6531.000000
## A.npnct26.log <NA> 0.000000
## A.npnct27.log <NA> 0.000000
## A.npnct28.log S.npnct28.log 126.862745
## A.npnct29.log <NA> 0.000000
## A.npnct30.log <NA> 0.000000
## A.nstopwrds.log S.nstopwrds.log 1.096091
## A.nuppr.log S.nuppr.log 1.151308
## A.nwrds.log S.nwrds.log 1.052805
## A.P.daily.clip.report H.T.clip 104.354839
## A.P.fashion.week S.P.fashion.week 40.081761
## A.P.first.draft S.P.first.draft 434.466667
## A.P.http A.npnct18.log 1305.200000
## A.P.metropolitan.diary.colon S.P.metropolitan.diary.colon 99.492308
## A.P.year.colon S.P.year.colon 652.200000
## A.ratio.nstopwrds.nwrds S.ratio.nstopwrds.nwrds 1.915094
## A.ratio.sum.TfIdf.nwrds A.nstopwrds.log 2.583333
## A.T.appear H.T.word 228.821429
## A.T.archiv S.T.intern 144.545455
## A.T.can S.T.can 261.666667
## A.T.day S.T.day 89.514286
## A.T.diari S.T.diari 71.528090
## A.T.herald S.T.herald 144.750000
## A.T.make S.T.make 273.782609
## A.T.new S.T.new 114.403846
## A.T.obama S.T.obama 398.625000
## A.T.one S.T.one 214.931034
## A.T.said S.T.said 202.516129
## A.T.share S.T.share 234.629630
## A.T.show S.T.show 263.166667
## A.T.take S.T.take 274.565217
## A.T.time S.T.time 217.827586
## A.T.tribun A.T.herald 144.750000
## H.nchrs.log H.nwrds.log 1.023810
## H.npnct03.log <NA> 2176.333333
## H.npnct04.log H.T.billion 38.325301
## H.npnct06.log H.npnct16.log 68.935484
## H.npnct09.log <NA> 0.000000
## H.npnct10.log <NA> 6531.000000
## H.npnct14.log H.T.springsumm 52.983471
## H.npnct17.log <NA> 0.000000
## H.npnct18.log <NA> 0.000000
## H.npnct20.log <NA> 6531.000000
## H.npnct21.log <NA> 0.000000
## H.npnct22.log <NA> 0.000000
## H.npnct23.log <NA> 0.000000
## H.npnct24.log <NA> 0.000000
## H.npnct25.log <NA> 0.000000
## H.npnct26.log <NA> 0.000000
## H.npnct27.log <NA> 0.000000
## H.npnct29.log <NA> 0.000000
## H.npnct30.log <NA> 0.000000
## H.nwrds.unq.log H.nuppr.log 1.000000
## H.P.http <NA> 0.000000
## H.P.today.in.politic H.T.polit 144.155556
## H.P.what.we.are H.T.read 141.000000
## H.P.year.colon A.T.archiv 32.670103
## H.T.clip <NA> 104.354839
## H.T.daili H.T.report 102.903226
## H.T.fashion H.P.fashion.week 76.926829
## H.T.first H.P.first.draft 194.727273
## H.T.morn A.npnct28.log 165.205128
## H.T.today H.P.today.in.politic 138.239130
## H.T.X2015 A.T.diari 96.833333
## Popular <NA> 4.976212
## Popular.fctr <NA> NA
## PubDate.last1 <NA> 1.142857
## PubDate.last10 <NA> 1.666667
## PubDate.last100 <NA> 25.000000
## PubDate.month.fctr <NA> 1.017514
## PubDate.POSIX <NA> 1.000000
## PubDate.year.fctr <NA> 0.000000
## PubDate.zoo <NA> 1.000000
## S.nchrs.log A.nwrds.log 1.328571
## S.npnct02.log <NA> 6531.000000
## S.npnct05.log <NA> 0.000000
## S.npnct09.log <NA> 0.000000
## S.npnct10.log <NA> 6531.000000
## S.npnct13.log A.npnct13.log 4.672000
## S.npnct16.log <NA> 434.133333
## S.npnct17.log <NA> 0.000000
## S.npnct18.log <NA> 0.000000
## S.npnct19.log A.npnct19.log 12.862366
## S.npnct21.log A.npnct21.log 6531.000000
## S.npnct22.log <NA> 0.000000
## S.npnct23.log <NA> 6531.000000
## S.npnct24.log <NA> 0.000000
## S.npnct25.log <NA> 0.000000
## S.npnct26.log <NA> 0.000000
## S.npnct27.log <NA> 0.000000
## S.npnct29.log <NA> 0.000000
## S.npnct30.log <NA> 0.000000
## S.nwrds.unq.log S.nchrs.log 1.054206
## S.P.daily.clip.report <NA> 104.354839
## S.P.http <NA> 0.000000
## S.sum.TfIdf A.sum.TfIdf 2.583333
## S.T.articl A.T.articl 85.500000
## S.T.compani A.T.compani 137.111111
## S.T.fashion H.T.X2015 59.245283
## S.T.first A.T.first 225.250000
## S.T.intern A.T.intern 140.400000
## S.T.newyork A.T.newyork 149.547619
## S.T.newyorktim A.T.newyorktim 84.540541
## S.T.photo A.T.photo 70.400000
## S.T.report A.T.report 80.371795
## S.T.senat A.T.senat 372.352941
## S.T.week A.T.week 56.560748
## S.T.will A.T.will 119.340000
## S.T.word A.T.word 133.125000
## S.T.year A.T.year 160.815789
## UniqueID <NA> 1.000000
## WordCount <NA> 2.315789
## percentUnique zeroVar nzv myNearZV
## WordCount.log 24.14268218 FALSE FALSE FALSE
## H.P.readers.respond 0.03061849 FALSE TRUE FALSE
## myCategory.fctr 0.30618494 FALSE FALSE FALSE
## H.npnct19.log 0.06123699 FALSE FALSE FALSE
## H.npnct15.log 0.04592774 FALSE FALSE FALSE
## .clusterid.fctr 1.17881200 FALSE FALSE FALSE
## A.npnct13.log 0.16840171 FALSE FALSE FALSE
## A.npnct19.log 0.07654623 FALSE FALSE FALSE
## S.nuppr.log 0.33680343 FALSE FALSE FALSE
## S.T.diari 0.18371096 FALSE TRUE FALSE
## H.T.word 0.13778322 FALSE TRUE FALSE
## H.npnct08.log 0.03061849 FALSE TRUE FALSE
## H.T.read 0.16840171 FALSE TRUE FALSE
## H.ndgts.log 0.18371096 FALSE FALSE FALSE
## S.P.metropolitan.diary.colon 0.03061849 FALSE TRUE FALSE
## S.ratio.sum.TfIdf.nwrds 94.45805266 FALSE FALSE FALSE
## A.T.newyork 0.44396816 FALSE TRUE FALSE
## H.nuppr.log 0.29087569 FALSE FALSE FALSE
## S.T.make 0.44396816 FALSE TRUE FALSE
## PubDate.wkday.fctr 0.10716473 FALSE FALSE FALSE
## H.nstopwrds.log 0.12247397 FALSE FALSE FALSE
## H.ratio.nstopwrds.nwrds 0.96448255 FALSE FALSE FALSE
## H.npnct11.log 0.07654623 FALSE FALSE FALSE
## S.T.can 0.41334966 FALSE TRUE FALSE
## H.P.no.comment.colon 0.03061849 FALSE TRUE FALSE
## H.P.friday.night.music 0.03061849 FALSE TRUE FALSE
## A.T.newyorktim 0.32149418 FALSE TRUE FALSE
## S.npnct04.log 0.07654623 FALSE TRUE FALSE
## H.T.newyork 0.15309247 FALSE TRUE FALSE
## S.T.share 0.38273117 FALSE TRUE FALSE
## S.npnct08.log 0.06123699 FALSE TRUE FALSE
## H.sum.TfIdf 84.44580527 FALSE FALSE FALSE
## H.P.recap.colon 0.03061849 FALSE TRUE FALSE
## S.T.one 0.44396816 FALSE TRUE FALSE
## H.npnct07.log 0.12247397 FALSE FALSE FALSE
## PubDate.last10.log 79.05695040 FALSE FALSE FALSE
## H.T.report 0.16840171 FALSE TRUE FALSE
## A.nwrds.unq.log 0.55113288 FALSE FALSE FALSE
## A.T.report 0.38273117 FALSE TRUE FALSE
## PubDate.hour.fctr 0.04592774 FALSE FALSE FALSE
## A.T.articl 0.29087569 FALSE TRUE FALSE
## A.sum.TfIdf 94.27434170 FALSE FALSE FALSE
## S.nstopwrds.log 0.38273117 FALSE FALSE FALSE
## PubDate.minute.fctr 0.06123699 FALSE FALSE FALSE
## H.T.polit 0.13778322 FALSE TRUE FALSE
## S.ratio.nstopwrds.nwrds 3.75076546 FALSE FALSE FALSE
## A.T.intern 0.32149418 FALSE TRUE FALSE
## S.T.time 0.42865891 FALSE TRUE FALSE
## H.npnct12.log 0.09185548 FALSE FALSE FALSE
## S.T.take 0.38273117 FALSE TRUE FALSE
## H.T.art 0.19902021 FALSE TRUE FALSE
## H.npnct13.log 0.12247397 FALSE TRUE FALSE
## PubDate.second.fctr 0.06123699 FALSE FALSE FALSE
## H.T.week 0.16840171 FALSE TRUE FALSE
## H.T.get 0.18371096 FALSE TRUE FALSE
## S.npnct01.log 0.06123699 FALSE TRUE FALSE
## A.T.will 0.59706062 FALSE TRUE FALSE
## S.T.show 0.38273117 FALSE TRUE FALSE
## H.T.new 0.19902021 FALSE TRUE FALSE
## .rnorm 99.98469075 FALSE FALSE FALSE
## H.ratio.sum.TfIdf.nwrds 90.46233925 FALSE FALSE FALSE
## S.ndgts.log 0.26025720 FALSE FALSE FALSE
## H.T.say 0.16840171 FALSE TRUE FALSE
## A.T.first 0.42865891 FALSE TRUE FALSE
## A.T.photo 0.27556644 FALSE TRUE FALSE
## H.T.china 0.16840171 FALSE TRUE FALSE
## H.npnct01.log 0.04592774 FALSE TRUE FALSE
## H.T.make 0.13778322 FALSE TRUE FALSE
## A.T.senat 0.50520514 FALSE TRUE FALSE
## S.T.said 0.38273117 FALSE TRUE FALSE
## S.T.day 0.41334966 FALSE TRUE FALSE
## H.npnct28.log 0.03061849 FALSE TRUE FALSE
## H.T.news 0.15309247 FALSE TRUE FALSE
## H.npnct16.log 0.06123699 FALSE TRUE FALSE
## H.T.take 0.15309247 FALSE TRUE FALSE
## S.npnct12.log 0.09185548 FALSE FALSE FALSE
## H.T.busi 0.18371096 FALSE TRUE FALSE
## A.T.compani 0.48989590 FALSE TRUE FALSE
## S.npnct11.log 0.13778322 FALSE FALSE FALSE
## H.T.day 0.18371096 FALSE TRUE FALSE
## A.T.word 0.30618494 FALSE TRUE FALSE
## H.P.facts.figures 0.03061849 FALSE TRUE FALSE
## H.T.X2014 0.13778322 FALSE TRUE FALSE
## PubDate.last1.log 36.49724434 FALSE FALSE FALSE
## S.T.obama 0.38273117 FALSE TRUE FALSE
## PubDate.date.fctr 0.07654623 FALSE FALSE FALSE
## H.T.big 0.19902021 FALSE TRUE FALSE
## S.npnct14.log 0.04592774 FALSE TRUE FALSE
## A.npnct16.log 0.04592774 FALSE TRUE FALSE
## S.npnct06.log 0.03061849 FALSE TRUE FALSE
## S.T.appear 0.30618494 FALSE TRUE FALSE
## PubDate.last100.log 92.19228414 FALSE FALSE FALSE
## PubDate.wkend 0.03061849 FALSE FALSE FALSE
## H.T.ebola 0.16840171 FALSE TRUE FALSE
## H.nwrds.log 0.32149418 FALSE FALSE FALSE
## H.T.obama 0.16840171 FALSE TRUE FALSE
## A.T.year 0.48989590 FALSE TRUE FALSE
## A.nchrs.log 4.39375383 FALSE FALSE FALSE
## H.T.test 0.13778322 FALSE TRUE FALSE
## A.T.week 0.47458665 FALSE TRUE FALSE
## H.T.pictur 0.10716473 FALSE TRUE FALSE
## S.nwrds.log 0.73484385 FALSE FALSE FALSE
## H.T.newyorktim 0.12247397 FALSE TRUE FALSE
## S.npnct15.log 0.04592774 FALSE FALSE FALSE
## H.T.bank 0.13778322 FALSE TRUE FALSE
## H.T.billion 0.13778322 FALSE TRUE FALSE
## S.T.new 0.47458665 FALSE TRUE FALSE
## A.T.fashion 0.39804042 FALSE TRUE FALSE
## H.P.fashion.week 0.03061849 FALSE TRUE FALSE
## S.T.archiv 0.24494795 FALSE TRUE FALSE
## S.T.herald 0.24494795 FALSE TRUE FALSE
## H.T.springsumm 0.09185548 FALSE TRUE FALSE
## S.T.tribun 0.24494795 FALSE TRUE FALSE
## H.T.deal 0.13778322 FALSE TRUE FALSE
## H.P.first.draft 0.03061849 FALSE TRUE FALSE
## S.npnct28.log 0.04592774 FALSE TRUE FALSE
## H.P.daily.clip.report 0.03061849 FALSE TRUE FALSE
## H.P.today.in.smallbusiness 0.03061849 FALSE TRUE FALSE
## H.P.verbatim.colon 0.03061849 FALSE TRUE FALSE
## S.P.first.draft 0.03061849 FALSE TRUE FALSE
## H.npnct02.log 0.03061849 FALSE TRUE FALSE
## H.P.quandary 0.03061849 FALSE TRUE FALSE
## S.npnct20.log 0.03061849 FALSE TRUE FALSE
## S.npnct03.log 0.03061849 FALSE TRUE FALSE
## A.npnct18.log 0.06123699 FALSE TRUE FALSE
## A.T.presid 0.45927740 FALSE TRUE FALSE
## S.T.presid 0.42865891 FALSE TRUE FALSE
## S.P.year.colon 0.03061849 FALSE TRUE FALSE
## H.P.on.this.day 0.03061849 FALSE TRUE FALSE
## H.npnct05.log 0.03061849 FALSE TRUE FALSE
## S.npnct07.log 0.04592774 FALSE TRUE FALSE
## S.P.fashion.week 0.03061849 FALSE TRUE FALSE
## H.P.s.notebook 0.03061849 FALSE TRUE FALSE
## .clusterid 1.17881200 FALSE FALSE FALSE
## A.ndgts.log 0.29087569 FALSE FALSE FALSE
## A.npnct01.log 0.06123699 FALSE TRUE FALSE
## A.npnct02.log 0.04592774 FALSE TRUE FALSE
## A.npnct03.log 0.03061849 FALSE TRUE FALSE
## A.npnct04.log 0.07654623 FALSE TRUE FALSE
## A.npnct05.log 0.01530925 TRUE TRUE TRUE
## A.npnct06.log 0.03061849 FALSE TRUE FALSE
## A.npnct07.log 0.04592774 FALSE TRUE FALSE
## A.npnct08.log 0.06123699 FALSE TRUE FALSE
## A.npnct09.log 0.01530925 TRUE TRUE TRUE
## A.npnct10.log 0.03061849 FALSE TRUE TRUE
## A.npnct11.log 0.13778322 FALSE FALSE FALSE
## A.npnct12.log 0.12247397 FALSE FALSE FALSE
## A.npnct14.log 0.10716473 FALSE TRUE FALSE
## A.npnct15.log 0.04592774 FALSE FALSE FALSE
## A.npnct17.log 0.04592774 FALSE TRUE FALSE
## A.npnct20.log 0.03061849 FALSE TRUE FALSE
## A.npnct21.log 0.04592774 FALSE TRUE TRUE
## A.npnct22.log 0.01530925 TRUE TRUE TRUE
## A.npnct23.log 0.04592774 FALSE TRUE TRUE
## A.npnct24.log 0.01530925 TRUE TRUE TRUE
## A.npnct25.log 0.03061849 FALSE TRUE TRUE
## A.npnct26.log 0.01530925 TRUE TRUE TRUE
## A.npnct27.log 0.01530925 TRUE TRUE TRUE
## A.npnct28.log 0.04592774 FALSE TRUE FALSE
## A.npnct29.log 0.01530925 TRUE TRUE TRUE
## A.npnct30.log 0.01530925 TRUE TRUE TRUE
## A.nstopwrds.log 0.42865891 FALSE FALSE FALSE
## A.nuppr.log 0.33680343 FALSE FALSE FALSE
## A.nwrds.log 0.93386405 FALSE FALSE FALSE
## A.P.daily.clip.report 0.03061849 FALSE TRUE FALSE
## A.P.fashion.week 0.03061849 FALSE TRUE FALSE
## A.P.first.draft 0.03061849 FALSE TRUE FALSE
## A.P.http 0.04592774 FALSE TRUE FALSE
## A.P.metropolitan.diary.colon 0.03061849 FALSE TRUE FALSE
## A.P.year.colon 0.03061849 FALSE TRUE FALSE
## A.ratio.nstopwrds.nwrds 4.10287814 FALSE FALSE FALSE
## A.ratio.sum.TfIdf.nwrds 94.51928965 FALSE FALSE FALSE
## A.T.appear 0.30618494 FALSE TRUE FALSE
## A.T.archiv 0.24494795 FALSE TRUE FALSE
## A.T.can 0.48989590 FALSE TRUE FALSE
## A.T.day 0.44396816 FALSE TRUE FALSE
## A.T.diari 0.18371096 FALSE TRUE FALSE
## A.T.herald 0.24494795 FALSE TRUE FALSE
## A.T.make 0.44396816 FALSE TRUE FALSE
## A.T.new 0.48989590 FALSE TRUE FALSE
## A.T.obama 0.42865891 FALSE TRUE FALSE
## A.T.one 0.48989590 FALSE TRUE FALSE
## A.T.said 0.41334966 FALSE TRUE FALSE
## A.T.share 0.38273117 FALSE TRUE FALSE
## A.T.show 0.39804042 FALSE TRUE FALSE
## A.T.take 0.42865891 FALSE TRUE FALSE
## A.T.time 0.42865891 FALSE TRUE FALSE
## A.T.tribun 0.24494795 FALSE TRUE FALSE
## H.nchrs.log 1.57685242 FALSE FALSE FALSE
## H.npnct03.log 0.03061849 FALSE TRUE TRUE
## H.npnct04.log 0.04592774 FALSE TRUE FALSE
## H.npnct06.log 0.06123699 FALSE TRUE FALSE
## H.npnct09.log 0.01530925 TRUE TRUE TRUE
## H.npnct10.log 0.03061849 FALSE TRUE TRUE
## H.npnct14.log 0.03061849 FALSE TRUE FALSE
## H.npnct17.log 0.01530925 TRUE TRUE TRUE
## H.npnct18.log 0.01530925 TRUE TRUE TRUE
## H.npnct20.log 0.03061849 FALSE TRUE TRUE
## H.npnct21.log 0.01530925 TRUE TRUE TRUE
## H.npnct22.log 0.01530925 TRUE TRUE TRUE
## H.npnct23.log 0.01530925 TRUE TRUE TRUE
## H.npnct24.log 0.01530925 TRUE TRUE TRUE
## H.npnct25.log 0.01530925 TRUE TRUE TRUE
## H.npnct26.log 0.01530925 TRUE TRUE TRUE
## H.npnct27.log 0.01530925 TRUE TRUE TRUE
## H.npnct29.log 0.01530925 TRUE TRUE TRUE
## H.npnct30.log 0.01530925 TRUE TRUE TRUE
## H.nwrds.unq.log 0.21432945 FALSE FALSE FALSE
## H.P.http 0.01530925 TRUE TRUE TRUE
## H.P.today.in.politic 0.03061849 FALSE TRUE FALSE
## H.P.what.we.are 0.03061849 FALSE TRUE FALSE
## H.P.year.colon 0.03061849 FALSE TRUE FALSE
## H.T.clip 0.03061849 FALSE TRUE FALSE
## H.T.daili 0.16840171 FALSE TRUE FALSE
## H.T.fashion 0.19902021 FALSE TRUE FALSE
## H.T.first 0.15309247 FALSE TRUE FALSE
## H.T.morn 0.07654623 FALSE TRUE FALSE
## H.T.today 0.13778322 FALSE TRUE FALSE
## H.T.X2015 0.10716473 FALSE TRUE FALSE
## Popular 0.03061849 FALSE FALSE FALSE
## Popular.fctr NA NA NA NA
## PubDate.last1 36.49724434 FALSE FALSE FALSE
## PubDate.last10 79.05695040 FALSE FALSE FALSE
## PubDate.last100 92.52908757 FALSE FALSE FALSE
## PubDate.month.fctr 0.04592774 FALSE FALSE FALSE
## PubDate.POSIX 99.86221678 FALSE FALSE FALSE
## PubDate.year.fctr 0.01530925 TRUE TRUE TRUE
## PubDate.zoo 99.86221678 FALSE FALSE FALSE
## S.nchrs.log 3.72014697 FALSE FALSE FALSE
## S.npnct02.log 0.03061849 FALSE TRUE TRUE
## S.npnct05.log 0.01530925 TRUE TRUE TRUE
## S.npnct09.log 0.01530925 TRUE TRUE TRUE
## S.npnct10.log 0.03061849 FALSE TRUE TRUE
## S.npnct13.log 0.16840171 FALSE FALSE FALSE
## S.npnct16.log 0.04592774 FALSE TRUE FALSE
## S.npnct17.log 0.01530925 TRUE TRUE TRUE
## S.npnct18.log 0.01530925 TRUE TRUE TRUE
## S.npnct19.log 0.07654623 FALSE FALSE FALSE
## S.npnct21.log 0.03061849 FALSE TRUE TRUE
## S.npnct22.log 0.01530925 TRUE TRUE TRUE
## S.npnct23.log 0.03061849 FALSE TRUE TRUE
## S.npnct24.log 0.01530925 TRUE TRUE TRUE
## S.npnct25.log 0.01530925 TRUE TRUE TRUE
## S.npnct26.log 0.01530925 TRUE TRUE TRUE
## S.npnct27.log 0.01530925 TRUE TRUE TRUE
## S.npnct29.log 0.01530925 TRUE TRUE TRUE
## S.npnct30.log 0.01530925 TRUE TRUE TRUE
## S.nwrds.unq.log 0.44396816 FALSE FALSE FALSE
## S.P.daily.clip.report 0.03061849 FALSE TRUE FALSE
## S.P.http 0.01530925 TRUE TRUE TRUE
## S.sum.TfIdf 94.32026944 FALSE FALSE FALSE
## S.T.articl 0.29087569 FALSE TRUE FALSE
## S.T.compani 0.44396816 FALSE TRUE FALSE
## S.T.fashion 0.38273117 FALSE TRUE FALSE
## S.T.first 0.41334966 FALSE TRUE FALSE
## S.T.intern 0.30618494 FALSE TRUE FALSE
## S.T.newyork 0.41334966 FALSE TRUE FALSE
## S.T.newyorktim 0.33680343 FALSE TRUE FALSE
## S.T.photo 0.29087569 FALSE TRUE FALSE
## S.T.report 0.35211268 FALSE TRUE FALSE
## S.T.senat 0.47458665 FALSE TRUE FALSE
## S.T.week 0.41334966 FALSE TRUE FALSE
## S.T.will 0.55113288 FALSE TRUE FALSE
## S.T.word 0.30618494 FALSE TRUE FALSE
## S.T.year 0.45927740 FALSE TRUE FALSE
## UniqueID 100.00000000 FALSE FALSE FALSE
## WordCount 24.15799143 FALSE FALSE FALSE
## is.cor.y.abs.low rsp_var_raw id_var rsp_var
## WordCount.log FALSE FALSE NA NA
## H.P.readers.respond FALSE FALSE NA NA
## myCategory.fctr FALSE FALSE NA NA
## H.npnct19.log FALSE FALSE NA NA
## H.npnct15.log FALSE FALSE NA NA
## .clusterid.fctr FALSE FALSE NA NA
## A.npnct13.log FALSE FALSE NA NA
## A.npnct19.log FALSE FALSE NA NA
## S.nuppr.log FALSE FALSE NA NA
## S.T.diari FALSE FALSE NA NA
## H.T.word FALSE FALSE NA NA
## H.npnct08.log FALSE FALSE NA NA
## H.T.read FALSE FALSE NA NA
## H.ndgts.log FALSE FALSE NA NA
## S.P.metropolitan.diary.colon FALSE FALSE NA NA
## S.ratio.sum.TfIdf.nwrds FALSE FALSE NA NA
## A.T.newyork FALSE FALSE NA NA
## H.nuppr.log FALSE FALSE NA NA
## S.T.make FALSE FALSE NA NA
## PubDate.wkday.fctr FALSE FALSE NA NA
## H.nstopwrds.log FALSE FALSE NA NA
## H.ratio.nstopwrds.nwrds FALSE FALSE NA NA
## H.npnct11.log FALSE FALSE NA NA
## S.T.can FALSE FALSE NA NA
## H.P.no.comment.colon FALSE FALSE NA NA
## H.P.friday.night.music FALSE FALSE NA NA
## A.T.newyorktim FALSE FALSE NA NA
## S.npnct04.log FALSE FALSE NA NA
## H.T.newyork FALSE FALSE NA NA
## S.T.share FALSE FALSE NA NA
## S.npnct08.log TRUE FALSE NA NA
## H.sum.TfIdf FALSE FALSE NA NA
## H.P.recap.colon FALSE FALSE NA NA
## S.T.one FALSE FALSE NA NA
## H.npnct07.log FALSE FALSE NA NA
## PubDate.last10.log FALSE FALSE NA NA
## H.T.report FALSE FALSE NA NA
## A.nwrds.unq.log FALSE FALSE NA NA
## A.T.report FALSE FALSE NA NA
## PubDate.hour.fctr FALSE FALSE NA NA
## A.T.articl FALSE FALSE NA NA
## A.sum.TfIdf FALSE FALSE NA NA
## S.nstopwrds.log FALSE FALSE NA NA
## PubDate.minute.fctr FALSE FALSE NA NA
## H.T.polit FALSE FALSE NA NA
## S.ratio.nstopwrds.nwrds FALSE FALSE NA NA
## A.T.intern FALSE FALSE NA NA
## S.T.time FALSE FALSE NA NA
## H.npnct12.log FALSE FALSE NA NA
## S.T.take FALSE FALSE NA NA
## H.T.art FALSE FALSE NA NA
## H.npnct13.log FALSE FALSE NA NA
## PubDate.second.fctr FALSE FALSE NA NA
## H.T.week FALSE FALSE NA NA
## H.T.get FALSE FALSE NA NA
## S.npnct01.log FALSE FALSE NA NA
## A.T.will FALSE FALSE NA NA
## S.T.show FALSE FALSE NA NA
## H.T.new FALSE FALSE NA NA
## .rnorm FALSE FALSE NA NA
## H.ratio.sum.TfIdf.nwrds FALSE FALSE NA NA
## S.ndgts.log FALSE FALSE NA NA
## H.T.say FALSE FALSE NA NA
## A.T.first FALSE FALSE NA NA
## A.T.photo FALSE FALSE NA NA
## H.T.china FALSE FALSE NA NA
## H.npnct01.log FALSE FALSE NA NA
## H.T.make FALSE FALSE NA NA
## A.T.senat FALSE FALSE NA NA
## S.T.said FALSE FALSE NA NA
## S.T.day FALSE FALSE NA NA
## H.npnct28.log FALSE FALSE NA NA
## H.T.news FALSE FALSE NA NA
## H.npnct16.log FALSE FALSE NA NA
## H.T.take TRUE FALSE NA NA
## S.npnct12.log FALSE FALSE NA NA
## H.T.busi FALSE FALSE NA NA
## A.T.compani FALSE FALSE NA NA
## S.npnct11.log FALSE FALSE NA NA
## H.T.day FALSE FALSE NA NA
## A.T.word FALSE FALSE NA NA
## H.P.facts.figures FALSE FALSE NA NA
## H.T.X2014 FALSE FALSE NA NA
## PubDate.last1.log FALSE FALSE NA NA
## S.T.obama FALSE FALSE NA NA
## PubDate.date.fctr FALSE FALSE NA NA
## H.T.big FALSE FALSE NA NA
## S.npnct14.log FALSE FALSE NA NA
## A.npnct16.log TRUE FALSE NA NA
## S.npnct06.log FALSE FALSE NA NA
## S.T.appear FALSE FALSE NA NA
## PubDate.last100.log TRUE FALSE NA NA
## PubDate.wkend FALSE FALSE NA NA
## H.T.ebola FALSE FALSE NA NA
## H.nwrds.log FALSE FALSE NA NA
## H.T.obama FALSE FALSE NA NA
## A.T.year FALSE FALSE NA NA
## A.nchrs.log FALSE FALSE NA NA
## H.T.test FALSE FALSE NA NA
## A.T.week FALSE FALSE NA NA
## H.T.pictur FALSE FALSE NA NA
## S.nwrds.log FALSE FALSE NA NA
## H.T.newyorktim FALSE FALSE NA NA
## S.npnct15.log FALSE FALSE NA NA
## H.T.bank FALSE FALSE NA NA
## H.T.billion FALSE FALSE NA NA
## S.T.new FALSE FALSE NA NA
## A.T.fashion FALSE FALSE NA NA
## H.P.fashion.week FALSE FALSE NA NA
## S.T.archiv FALSE FALSE NA NA
## S.T.herald FALSE FALSE NA NA
## H.T.springsumm FALSE FALSE NA NA
## S.T.tribun FALSE FALSE NA NA
## H.T.deal FALSE FALSE NA NA
## H.P.first.draft FALSE FALSE NA NA
## S.npnct28.log FALSE FALSE NA NA
## H.P.daily.clip.report FALSE FALSE NA NA
## H.P.today.in.smallbusiness FALSE FALSE NA NA
## H.P.verbatim.colon FALSE FALSE NA NA
## S.P.first.draft FALSE FALSE NA NA
## H.npnct02.log FALSE FALSE NA NA
## H.P.quandary FALSE FALSE NA NA
## S.npnct20.log FALSE FALSE NA NA
## S.npnct03.log FALSE FALSE NA NA
## A.npnct18.log FALSE FALSE NA NA
## A.T.presid TRUE FALSE NA NA
## S.T.presid TRUE FALSE NA NA
## S.P.year.colon FALSE FALSE NA NA
## H.P.on.this.day FALSE FALSE NA NA
## H.npnct05.log FALSE FALSE NA NA
## S.npnct07.log FALSE FALSE NA NA
## S.P.fashion.week FALSE FALSE NA NA
## H.P.s.notebook TRUE FALSE NA NA
## .clusterid FALSE FALSE NA NA
## A.ndgts.log FALSE FALSE NA NA
## A.npnct01.log FALSE FALSE NA NA
## A.npnct02.log FALSE FALSE NA NA
## A.npnct03.log FALSE FALSE NA NA
## A.npnct04.log FALSE FALSE NA NA
## A.npnct05.log NA FALSE NA NA
## A.npnct06.log FALSE FALSE NA NA
## A.npnct07.log FALSE FALSE NA NA
## A.npnct08.log TRUE FALSE NA NA
## A.npnct09.log NA FALSE NA NA
## A.npnct10.log TRUE FALSE NA NA
## A.npnct11.log FALSE FALSE NA NA
## A.npnct12.log FALSE FALSE NA NA
## A.npnct14.log FALSE FALSE NA NA
## A.npnct15.log FALSE FALSE NA NA
## A.npnct17.log FALSE FALSE NA NA
## A.npnct20.log FALSE FALSE NA NA
## A.npnct21.log FALSE FALSE NA NA
## A.npnct22.log NA FALSE NA NA
## A.npnct23.log FALSE FALSE NA NA
## A.npnct24.log TRUE FALSE NA NA
## A.npnct25.log TRUE FALSE NA NA
## A.npnct26.log NA FALSE NA NA
## A.npnct27.log NA FALSE NA NA
## A.npnct28.log FALSE FALSE NA NA
## A.npnct29.log NA FALSE NA NA
## A.npnct30.log NA FALSE NA NA
## A.nstopwrds.log FALSE FALSE NA NA
## A.nuppr.log FALSE FALSE NA NA
## A.nwrds.log FALSE FALSE NA NA
## A.P.daily.clip.report FALSE FALSE NA NA
## A.P.fashion.week FALSE FALSE NA NA
## A.P.first.draft FALSE FALSE NA NA
## A.P.http FALSE FALSE NA NA
## A.P.metropolitan.diary.colon FALSE FALSE NA NA
## A.P.year.colon FALSE FALSE NA NA
## A.ratio.nstopwrds.nwrds FALSE FALSE NA NA
## A.ratio.sum.TfIdf.nwrds FALSE FALSE NA NA
## A.T.appear FALSE FALSE NA NA
## A.T.archiv FALSE FALSE NA NA
## A.T.can FALSE FALSE NA NA
## A.T.day FALSE FALSE NA NA
## A.T.diari FALSE FALSE NA NA
## A.T.herald FALSE FALSE NA NA
## A.T.make FALSE FALSE NA NA
## A.T.new FALSE FALSE NA NA
## A.T.obama FALSE FALSE NA NA
## A.T.one FALSE FALSE NA NA
## A.T.said FALSE FALSE NA NA
## A.T.share FALSE FALSE NA NA
## A.T.show FALSE FALSE NA NA
## A.T.take FALSE FALSE NA NA
## A.T.time FALSE FALSE NA NA
## A.T.tribun FALSE FALSE NA NA
## H.nchrs.log FALSE FALSE NA NA
## H.npnct03.log FALSE FALSE NA NA
## H.npnct04.log FALSE FALSE NA NA
## H.npnct06.log FALSE FALSE NA NA
## H.npnct09.log NA FALSE NA NA
## H.npnct10.log TRUE FALSE NA NA
## H.npnct14.log FALSE FALSE NA NA
## H.npnct17.log NA FALSE NA NA
## H.npnct18.log NA FALSE NA NA
## H.npnct20.log TRUE FALSE NA NA
## H.npnct21.log NA FALSE NA NA
## H.npnct22.log NA FALSE NA NA
## H.npnct23.log NA FALSE NA NA
## H.npnct24.log TRUE FALSE NA NA
## H.npnct25.log NA FALSE NA NA
## H.npnct26.log NA FALSE NA NA
## H.npnct27.log NA FALSE NA NA
## H.npnct29.log NA FALSE NA NA
## H.npnct30.log NA FALSE NA NA
## H.nwrds.unq.log FALSE FALSE NA NA
## H.P.http NA FALSE NA NA
## H.P.today.in.politic FALSE FALSE NA NA
## H.P.what.we.are FALSE FALSE NA NA
## H.P.year.colon FALSE FALSE NA NA
## H.T.clip FALSE FALSE NA NA
## H.T.daili FALSE FALSE NA NA
## H.T.fashion FALSE FALSE NA NA
## H.T.first FALSE FALSE NA NA
## H.T.morn FALSE FALSE NA NA
## H.T.today FALSE FALSE NA NA
## H.T.X2015 FALSE FALSE NA NA
## Popular FALSE TRUE NA NA
## Popular.fctr NA NA NA TRUE
## PubDate.last1 FALSE FALSE NA NA
## PubDate.last10 FALSE FALSE NA NA
## PubDate.last100 FALSE FALSE NA NA
## PubDate.month.fctr FALSE FALSE NA NA
## PubDate.POSIX FALSE FALSE NA NA
## PubDate.year.fctr NA FALSE NA NA
## PubDate.zoo FALSE FALSE NA NA
## S.nchrs.log FALSE FALSE NA NA
## S.npnct02.log TRUE FALSE NA NA
## S.npnct05.log NA FALSE NA NA
## S.npnct09.log NA FALSE NA NA
## S.npnct10.log TRUE FALSE NA NA
## S.npnct13.log FALSE FALSE NA NA
## S.npnct16.log TRUE FALSE NA NA
## S.npnct17.log NA FALSE NA NA
## S.npnct18.log NA FALSE NA NA
## S.npnct19.log FALSE FALSE NA NA
## S.npnct21.log FALSE FALSE NA NA
## S.npnct22.log NA FALSE NA NA
## S.npnct23.log FALSE FALSE NA NA
## S.npnct24.log TRUE FALSE NA NA
## S.npnct25.log NA FALSE NA NA
## S.npnct26.log NA FALSE NA NA
## S.npnct27.log NA FALSE NA NA
## S.npnct29.log NA FALSE NA NA
## S.npnct30.log NA FALSE NA NA
## S.nwrds.unq.log FALSE FALSE NA NA
## S.P.daily.clip.report FALSE FALSE NA NA
## S.P.http NA FALSE NA NA
## S.sum.TfIdf FALSE FALSE NA NA
## S.T.articl FALSE FALSE NA NA
## S.T.compani FALSE FALSE NA NA
## S.T.fashion FALSE FALSE NA NA
## S.T.first FALSE FALSE NA NA
## S.T.intern FALSE FALSE NA NA
## S.T.newyork FALSE FALSE NA NA
## S.T.newyorktim FALSE FALSE NA NA
## S.T.photo FALSE FALSE NA NA
## S.T.report FALSE FALSE NA NA
## S.T.senat FALSE FALSE NA NA
## S.T.week FALSE FALSE NA NA
## S.T.will FALSE FALSE NA NA
## S.T.word FALSE FALSE NA NA
## S.T.year FALSE FALSE NA NA
## UniqueID FALSE FALSE TRUE NA
## WordCount FALSE FALSE NA NA
## importance Low.cor.X.glm.importance
## WordCount.log 1.000000e+02 1.000000e+02
## H.P.readers.respond 5.146747e+01 5.146747e+01
## myCategory.fctr 4.281250e+01 4.281250e+01
## H.npnct19.log 4.004695e+01 4.004695e+01
## H.npnct15.log 3.467378e+01 3.467378e+01
## .clusterid.fctr 2.932308e+01 2.932308e+01
## A.npnct13.log 2.871074e+01 2.871074e+01
## A.npnct19.log 2.850228e+01 2.850228e+01
## S.nuppr.log 2.501950e+01 2.501950e+01
## S.T.diari 2.124067e+01 2.124067e+01
## H.T.word 2.105883e+01 2.105883e+01
## H.npnct08.log 2.045852e+01 2.045852e+01
## H.T.read 2.019170e+01 2.019170e+01
## H.ndgts.log 1.951144e+01 1.951144e+01
## S.P.metropolitan.diary.colon 1.888990e+01 1.888990e+01
## S.ratio.sum.TfIdf.nwrds 1.854118e+01 1.854118e+01
## A.T.newyork 1.825819e+01 1.825819e+01
## H.nuppr.log 1.817153e+01 1.817153e+01
## S.T.make 1.723536e+01 1.723536e+01
## PubDate.wkday.fctr 1.661497e+01 1.661497e+01
## H.nstopwrds.log 1.627434e+01 1.627434e+01
## H.ratio.nstopwrds.nwrds 1.623508e+01 1.623508e+01
## H.npnct11.log 1.582943e+01 1.582943e+01
## S.T.can 1.557214e+01 1.557214e+01
## H.P.no.comment.colon 1.552028e+01 1.552028e+01
## H.P.friday.night.music 1.488096e+01 1.488096e+01
## A.T.newyorktim 1.471359e+01 1.471359e+01
## S.npnct04.log 1.357564e+01 1.357564e+01
## H.T.newyork 1.337672e+01 1.337672e+01
## S.T.share 1.322366e+01 1.322366e+01
## S.npnct08.log 1.319408e+01 1.319408e+01
## H.sum.TfIdf 1.282767e+01 1.282767e+01
## H.P.recap.colon 1.273820e+01 1.273820e+01
## S.T.one 1.238107e+01 1.238107e+01
## H.npnct07.log 1.222449e+01 1.222449e+01
## PubDate.last10.log 1.193952e+01 1.193952e+01
## H.T.report 1.191324e+01 1.191324e+01
## A.nwrds.unq.log 1.190743e+01 1.190743e+01
## A.T.report 1.155728e+01 1.155728e+01
## PubDate.hour.fctr 1.144174e+01 1.144174e+01
## A.T.articl 1.128851e+01 1.128851e+01
## A.sum.TfIdf 1.110991e+01 1.110991e+01
## S.nstopwrds.log 1.099737e+01 1.099737e+01
## PubDate.minute.fctr 1.055593e+01 1.055593e+01
## H.T.polit 1.032948e+01 1.032948e+01
## S.ratio.nstopwrds.nwrds 1.026798e+01 1.026798e+01
## A.T.intern 9.914726e+00 9.914726e+00
## S.T.time 9.904830e+00 9.904830e+00
## H.npnct12.log 9.888614e+00 9.888614e+00
## S.T.take 9.852005e+00 9.852005e+00
## H.T.art 9.842033e+00 9.842033e+00
## H.npnct13.log 9.829552e+00 9.829552e+00
## PubDate.second.fctr 9.794379e+00 9.794379e+00
## H.T.week 9.718351e+00 9.718351e+00
## H.T.get 9.576775e+00 9.576775e+00
## S.npnct01.log 9.564919e+00 9.564919e+00
## A.T.will 9.491070e+00 9.491070e+00
## S.T.show 9.342530e+00 9.342530e+00
## H.T.new 9.203584e+00 9.203584e+00
## .rnorm 8.941026e+00 8.941026e+00
## H.ratio.sum.TfIdf.nwrds 8.939183e+00 8.939183e+00
## S.ndgts.log 8.755923e+00 8.755923e+00
## H.T.say 8.671813e+00 8.671813e+00
## A.T.first 8.505413e+00 8.505413e+00
## A.T.photo 8.036467e+00 8.036467e+00
## H.T.china 7.799595e+00 7.799595e+00
## H.npnct01.log 7.787548e+00 7.787548e+00
## H.T.make 7.674641e+00 7.674641e+00
## A.T.senat 7.437682e+00 7.437682e+00
## S.T.said 7.364645e+00 7.364645e+00
## S.T.day 7.260435e+00 7.260435e+00
## H.npnct28.log 7.016220e+00 7.016220e+00
## H.T.news 6.901869e+00 6.901869e+00
## H.npnct16.log 6.896467e+00 6.896467e+00
## H.T.take 6.881392e+00 6.881392e+00
## S.npnct12.log 6.296543e+00 6.296543e+00
## H.T.busi 5.892173e+00 5.892173e+00
## A.T.compani 5.701348e+00 5.701348e+00
## S.npnct11.log 5.458567e+00 5.458567e+00
## H.T.day 5.341694e+00 5.341694e+00
## A.T.word 5.341188e+00 5.341188e+00
## H.P.facts.figures 5.332319e+00 5.332319e+00
## H.T.X2014 5.105515e+00 5.105515e+00
## PubDate.last1.log 5.029197e+00 5.029197e+00
## S.T.obama 4.950818e+00 4.950818e+00
## PubDate.date.fctr 4.692266e+00 4.692266e+00
## H.T.big 4.491165e+00 4.491165e+00
## S.npnct14.log 4.449833e+00 4.449833e+00
## A.npnct16.log 4.218694e+00 4.218694e+00
## S.npnct06.log 3.899006e+00 3.899006e+00
## S.T.appear 3.840502e+00 3.840502e+00
## PubDate.last100.log 3.387278e+00 3.387278e+00
## PubDate.wkend 3.323031e+00 3.323031e+00
## H.T.ebola 3.198159e+00 3.198159e+00
## H.nwrds.log 3.088101e+00 3.088101e+00
## H.T.obama 2.703165e+00 2.703165e+00
## A.T.year 2.411870e+00 2.411870e+00
## A.nchrs.log 2.210854e+00 2.210854e+00
## H.T.test 1.856699e+00 1.856699e+00
## A.T.week 1.735453e+00 1.735453e+00
## H.T.pictur 1.728117e+00 1.728117e+00
## S.nwrds.log 1.511344e+00 1.511344e+00
## H.T.newyorktim 1.280498e+00 1.280498e+00
## S.npnct15.log 1.190329e+00 1.190329e+00
## H.T.bank 1.038507e+00 1.038507e+00
## H.T.billion 7.999936e-01 7.999936e-01
## S.T.new 1.565429e-01 1.565429e-01
## A.T.fashion 1.528913e-01 1.528913e-01
## H.P.fashion.week 1.109378e-01 1.109378e-01
## S.T.archiv 9.386317e-02 9.386317e-02
## S.T.herald 8.612681e-02 8.612681e-02
## H.T.springsumm 7.774263e-02 7.774263e-02
## S.T.tribun 7.345077e-02 7.345077e-02
## H.T.deal 6.047754e-02 6.047754e-02
## H.P.first.draft 4.572882e-02 4.572882e-02
## S.npnct28.log 4.463511e-02 4.463511e-02
## H.P.daily.clip.report 3.416759e-02 3.416759e-02
## H.P.today.in.smallbusiness 2.839539e-02 2.839539e-02
## H.P.verbatim.colon 1.634958e-02 1.634958e-02
## S.P.first.draft 1.547771e-02 1.547771e-02
## H.npnct02.log 1.526519e-02 1.526519e-02
## H.P.quandary 1.236942e-02 1.236942e-02
## S.npnct20.log 1.220902e-02 1.220902e-02
## S.npnct03.log 1.207385e-02 1.207385e-02
## A.npnct18.log 1.043696e-02 1.043696e-02
## A.T.presid 9.440632e-03 9.440632e-03
## S.T.presid 9.421346e-03 9.421346e-03
## S.P.year.colon 6.252341e-03 6.252341e-03
## H.P.on.this.day 5.059061e-03 5.059061e-03
## H.npnct05.log 4.614811e-03 4.614811e-03
## S.npnct07.log 2.450155e-03 2.450155e-03
## S.P.fashion.week 8.838548e-04 8.838548e-04
## H.P.s.notebook 0.000000e+00 0.000000e+00
## .clusterid NA NA
## A.ndgts.log NA NA
## A.npnct01.log NA NA
## A.npnct02.log NA NA
## A.npnct03.log NA NA
## A.npnct04.log NA NA
## A.npnct05.log NA NA
## A.npnct06.log NA NA
## A.npnct07.log NA NA
## A.npnct08.log NA NA
## A.npnct09.log NA NA
## A.npnct10.log NA NA
## A.npnct11.log NA NA
## A.npnct12.log NA NA
## A.npnct14.log NA NA
## A.npnct15.log NA NA
## A.npnct17.log NA NA
## A.npnct20.log NA NA
## A.npnct21.log NA NA
## A.npnct22.log NA NA
## A.npnct23.log NA NA
## A.npnct24.log NA NA
## A.npnct25.log NA NA
## A.npnct26.log NA NA
## A.npnct27.log NA NA
## A.npnct28.log NA NA
## A.npnct29.log NA NA
## A.npnct30.log NA NA
## A.nstopwrds.log NA NA
## A.nuppr.log NA NA
## A.nwrds.log NA NA
## A.P.daily.clip.report NA NA
## A.P.fashion.week NA NA
## A.P.first.draft NA NA
## A.P.http NA NA
## A.P.metropolitan.diary.colon NA NA
## A.P.year.colon NA NA
## A.ratio.nstopwrds.nwrds NA NA
## A.ratio.sum.TfIdf.nwrds NA NA
## A.T.appear NA NA
## A.T.archiv NA NA
## A.T.can NA NA
## A.T.day NA NA
## A.T.diari NA NA
## A.T.herald NA NA
## A.T.make NA NA
## A.T.new NA NA
## A.T.obama NA NA
## A.T.one NA NA
## A.T.said NA NA
## A.T.share NA NA
## A.T.show NA NA
## A.T.take NA NA
## A.T.time NA NA
## A.T.tribun NA NA
## H.nchrs.log NA NA
## H.npnct03.log NA NA
## H.npnct04.log NA NA
## H.npnct06.log NA NA
## H.npnct09.log NA NA
## H.npnct10.log NA NA
## H.npnct14.log NA NA
## H.npnct17.log NA NA
## H.npnct18.log NA NA
## H.npnct20.log NA NA
## H.npnct21.log NA NA
## H.npnct22.log NA NA
## H.npnct23.log NA NA
## H.npnct24.log NA NA
## H.npnct25.log NA NA
## H.npnct26.log NA NA
## H.npnct27.log NA NA
## H.npnct29.log NA NA
## H.npnct30.log NA NA
## H.nwrds.unq.log NA NA
## H.P.http NA NA
## H.P.today.in.politic NA NA
## H.P.what.we.are NA NA
## H.P.year.colon NA NA
## H.T.clip NA NA
## H.T.daili NA NA
## H.T.fashion NA NA
## H.T.first NA NA
## H.T.morn NA NA
## H.T.today NA NA
## H.T.X2015 NA NA
## Popular NA NA
## Popular.fctr NA NA
## PubDate.last1 NA NA
## PubDate.last10 NA NA
## PubDate.last100 NA NA
## PubDate.month.fctr NA NA
## PubDate.POSIX NA NA
## PubDate.year.fctr NA NA
## PubDate.zoo NA NA
## S.nchrs.log NA NA
## S.npnct02.log NA NA
## S.npnct05.log NA NA
## S.npnct09.log NA NA
## S.npnct10.log NA NA
## S.npnct13.log NA NA
## S.npnct16.log NA NA
## S.npnct17.log NA NA
## S.npnct18.log NA NA
## S.npnct19.log NA NA
## S.npnct21.log NA NA
## S.npnct22.log NA NA
## S.npnct23.log NA NA
## S.npnct24.log NA NA
## S.npnct25.log NA NA
## S.npnct26.log NA NA
## S.npnct27.log NA NA
## S.npnct29.log NA NA
## S.npnct30.log NA NA
## S.nwrds.unq.log NA NA
## S.P.daily.clip.report NA NA
## S.P.http NA NA
## S.sum.TfIdf NA NA
## S.T.articl NA NA
## S.T.compani NA NA
## S.T.fashion NA NA
## S.T.first NA NA
## S.T.intern NA NA
## S.T.newyork NA NA
## S.T.newyorktim NA NA
## S.T.photo NA NA
## S.T.report NA NA
## S.T.senat NA NA
## S.T.week NA NA
## S.T.will NA NA
## S.T.word NA NA
## S.T.year NA NA
## UniqueID NA NA
## WordCount NA NA
# Used again in fit.data.training & predict.data.new chunks
#
# Print diagnostic plots for the predictions of model `mdl_id` on `obs_df`:
#   1. for at most 5 features with importance > 0 (in glb_feats_df row order),
#      a jittered scatter plot of the feature against the actual response
#      (glb_rsp_var) and the predicted response, faceted by "variable";
#   2. a prediction plot per problem type (glb_is_regression and/or
#      glb_is_classification) using the first two listed features.
#
# Args:
#   obs_df:         data frame holding the feature columns, glb_rsp_var and
#                   the prediction column paste0(glb_rsp_var_out, mdl_id).
#   mdl_id:         model id; suffix of the prediction column name.
#   prob_threshold: forwarded to myplot_prediction_classification();
#                   defaults to NULL.
#
# Reads the globals glb_feats_df, glb_rsp_var, glb_rsp_var_out, glb_id_vars,
# glb_is_regression and glb_is_classification. Called for its printing side
# effects.
glb_analytics_diag_plots <- function(obs_df, mdl_id, prob_threshold=NULL) {
    # library() stops with a clear error when reshape2 is unavailable;
    # require() only returned FALSE and let melt() fail later on.
    library(reshape2)

    # x-axis feature for the prediction plots: the second listed feature, or
    # ".rownames" when only one feature is available. Plain if/else replaces
    # the scalar ifelse(), which strips attributes from its result.
    second_feat <- function(feats_df) {
        if (nrow(feats_df) > 1) feats_df$id[2] else ".rownames"
    }

    # Features with positive importance; rows with NA importance are dropped
    # by subset(). Used by both the scatter plots and the classification plot.
    imp_feats_df <- subset(glb_feats_df, importance > 0)
    rsp_var_out <- paste0(glb_rsp_var_out, mdl_id)

    scatter_feats <- imp_feats_df$id
    if (length(scatter_feats) > 5) {
        warning("Limiting important feature scatter plots to 5 out of ",
                length(scatter_feats))
        scatter_feats <- head(scatter_feats, 5)
    }

    # `feat` instead of `var` so the loop variable does not shadow stats::var
    for (feat in scatter_feats) {
        # Long format: one row per (observation, actual | predicted) pair
        plot_df <- melt(obs_df, id.vars=feat,
                        measure.vars=c(glb_rsp_var, rsp_var_out))
#         if (feat == "<feat_name>") print(myplot_scatter(plot_df, feat, "value",
#                                              facet_colcol_name="variable") +
#                       geom_vline(xintercept=<divider_val>, linetype="dotted")) else
        print(myplot_scatter(plot_df, feat, "value", colorcol_name="variable",
                             facet_colcol_name="variable", jitter=TRUE) +
              guides(color=FALSE))
    }

    if (glb_is_regression) {
#         plot_vars_df <- subset(glb_feats_df, importance >
#                         glb_feats_df[glb_feats_df$id == ".rnorm", "importance"])
        # All features, most important first
        plot_vars_df <- orderBy(~ -importance, glb_feats_df)
        if (nrow(plot_vars_df) == 0) {
            warning("No important features in glb_fin_mdl")
        } else {
            print(myplot_prediction_regression(df=obs_df,
                        feat_x=second_feat(plot_vars_df),
                        feat_y=plot_vars_df$id[1],
                        rsp_var=glb_rsp_var, rsp_var_out=rsp_var_out,
                        id_vars=glb_id_vars)
    #               + facet_wrap(reformulate(plot_vars_df$id[2])) # if [1 or 2] is a factor
    #               + geom_point(aes_string(color="<col_name>.fctr")) #  to color the plot
                  )
        }
    }

    if (glb_is_classification) {
        if (nrow(imp_feats_df) == 0) {
            warning("No features in selected model are statistically important")
        } else {
            print(myplot_prediction_classification(df=obs_df,
                        feat_x=second_feat(imp_feats_df),
                        feat_y=imp_feats_df$id[1],
                        rsp_var=glb_rsp_var,
                        rsp_var_out=rsp_var_out,
                        id_vars=glb_id_vars,
                        prob_threshold=prob_threshold)
    #               + geom_hline(yintercept=<divider_val>, linetype = "dotted")
                  )
        }
    }
}
# Diagnostic plots for the selected model on the OOB observations.
# The probability threshold only applies to binomial classification; every
# other case passes NULL. A plain if/else is required here:
# ifelse(FALSE, x, NULL) stops with "replacement has length zero" instead of
# returning NULL.
glb_analytics_diag_plots(obs_df=glb_OOBobs_df, mdl_id=glb_sel_mdl_id,
    prob_threshold=if (glb_is_classification && glb_is_binomial)
        glb_models_df[glb_models_df$model_id == glb_sel_mdl_id,
                      "opt.prob.threshold.OOB"] else NULL)
## Warning in glb_analytics_diag_plots(obs_df = glb_OOBobs_df, mdl_id =
## glb_sel_mdl_id, : Limiting important feature scatter plots to 5 out of 132
## [1] "Min/Max Boundaries: "
## UniqueID Popular.fctr Popular.fctr.predict.Low.cor.X.glm.prob
## 172 172 Y 3.182151e-03
## 1273 1273 Y 1.055980e-01
## 4 4 Y 1.632416e-01
## 37 37 Y 2.914392e-01
## 31 31 N 1.038794e-02
## 937 937 N 9.262422e-09
## 6018 6018 N 4.704354e-05
## 6370 6370 Y 4.663571e-01
## 194 194 N 3.748948e-01
## 24 24 N 3.930412e-01
## 17 17 N 9.789891e-01
## Popular.fctr.predict.Low.cor.X.glm
## 172 N
## 1273 N
## 4 N
## 37 N
## 31 N
## 937 N
## 6018 N
## 6370 Y
## 194 Y
## 24 Y
## 17 Y
## Popular.fctr.predict.Low.cor.X.glm.accurate
## 172 FALSE
## 1273 FALSE
## 4 FALSE
## 37 FALSE
## 31 TRUE
## 937 TRUE
## 6018 TRUE
## 6370 TRUE
## 194 FALSE
## 24 FALSE
## 17 FALSE
## Popular.fctr.predict.Low.cor.X.glm.error .label
## 172 -0.296817849 172
## 1273 -0.194402049 1273
## 4 -0.136758430 4
## 37 -0.008560785 37
## 31 0.000000000 31
## 937 0.000000000 937
## 6018 0.000000000 6018
## 6370 0.000000000 6370
## 194 0.074894769 194
## 24 0.093041154 24
## 17 0.678989145 17
## [1] "Inaccurate: "
## UniqueID Popular.fctr Popular.fctr.predict.Low.cor.X.glm.prob
## 3743 3743 Y 2.220446e-16
## 5423 5423 Y 2.220446e-16
## 5573 5573 Y 3.802754e-11
## 6387 6387 Y 9.740336e-11
## 2026 2026 Y 1.208196e-10
## 5486 5486 Y 4.599563e-10
## Popular.fctr.predict.Low.cor.X.glm
## 3743 N
## 5423 N
## 5573 N
## 6387 N
## 2026 N
## 5486 N
## Popular.fctr.predict.Low.cor.X.glm.accurate
## 3743 FALSE
## 5423 FALSE
## 5573 FALSE
## 6387 FALSE
## 2026 FALSE
## 5486 FALSE
## Popular.fctr.predict.Low.cor.X.glm.error
## 3743 -0.3
## 5423 -0.3
## 5573 -0.3
## 6387 -0.3
## 2026 -0.3
## 5486 -0.3
## UniqueID Popular.fctr Popular.fctr.predict.Low.cor.X.glm.prob
## 3882 3882 Y 6.742752e-10
## 172 172 Y 3.182151e-03
## 2716 2716 Y 1.273974e-01
## 5834 5834 N 4.045917e-01
## 2382 2382 N 6.507893e-01
## 4134 4134 N 7.177183e-01
## Popular.fctr.predict.Low.cor.X.glm
## 3882 N
## 172 N
## 2716 N
## 5834 Y
## 2382 Y
## 4134 Y
## Popular.fctr.predict.Low.cor.X.glm.accurate
## 3882 FALSE
## 172 FALSE
## 2716 FALSE
## 5834 FALSE
## 2382 FALSE
## 4134 FALSE
## Popular.fctr.predict.Low.cor.X.glm.error
## 3882 -0.3000000
## 172 -0.2968178
## 2716 -0.1726026
## 5834 0.1045917
## 2382 0.3507893
## 4134 0.4177183
## UniqueID Popular.fctr Popular.fctr.predict.Low.cor.X.glm.prob
## 725 725 N 0.9687829
## 17 17 N 0.9789891
## 770 770 N 0.9828067
## 4882 4882 N 0.9829636
## 4975 4975 N 0.9904139
## 1667 1667 N 1.0000000
## Popular.fctr.predict.Low.cor.X.glm
## 725 Y
## 17 Y
## 770 Y
## 4882 Y
## 4975 Y
## 1667 Y
## Popular.fctr.predict.Low.cor.X.glm.accurate
## 725 FALSE
## 17 FALSE
## 770 FALSE
## 4882 FALSE
## 4975 FALSE
## 1667 FALSE
## Popular.fctr.predict.Low.cor.X.glm.error
## 725 0.6687829
## 17 0.6789891
## 770 0.6828067
## 4882 0.6829636
## 4975 0.6904139
## 1667 0.7000000
# gather predictions from models better than MFO.*
#mdl_id <- "Conditional.X.rf"
#mdl_id <- "Conditional.X.cp.0.rpart"
#mdl_id <- "Conditional.X.rpart"
# glb_OOBobs_df <- glb_get_predictions(df=glb_OOBobs_df, mdl_id,
# glb_rsp_var_out)
# print(t(confusionMatrix(glb_OOBobs_df[, paste0(glb_rsp_var_out, mdl_id)],
# glb_OOBobs_df[, glb_rsp_var])$table))
# Inspect hand-picked OOB observations: the response column next to every
# column whose name contains glb_rsp_var (the prediction columns).
# fixed=TRUE matches glb_rsp_var literally, consistent with the CSV export
# further down; as a regex, each "." in the name would match any character.
# NOTE(review): the recorded output shows 0 matching rows, so these IDs look
# stale for the current OOB split - confirm.
FN_OOB_ids <- c(4721, 4020, 693, 92)
print(glb_OOBobs_df[glb_OOBobs_df$UniqueID %in% FN_OOB_ids,
                    grep(glb_rsp_var, names(glb_OOBobs_df),
                         fixed=TRUE, value=TRUE)])
## [1] Popular.fctr
## [2] Popular.fctr.predict.Low.cor.X.glm.prob
## [3] Popular.fctr.predict.Low.cor.X.glm
## [4] Popular.fctr.predict.Low.cor.X.glm.accurate
## <0 rows> (or 0-length row.names)
# Same observations, restricted to the first five features listed in
# glb_feats_df. head() returns fewer names when fewer than five features
# exist, whereas id[1:5] would pad with NA and fail the column lookup.
print(glb_OOBobs_df[glb_OOBobs_df$UniqueID %in% FN_OOB_ids,
                    head(glb_feats_df$id, 5)])
## [1] WordCount.log H.P.readers.respond myCategory.fctr
## [4] H.npnct19.log H.npnct15.log
## <0 rows> (or 0-length row.names)
# Same observations again, this time showing the raw text columns
# (glb_txt_vars).
print(glb_OOBobs_df[is.element(glb_OOBobs_df$UniqueID, FN_OOB_ids),
                    glb_txt_vars])
## [1] Headline Snippet Abstract
## <0 rows> (or 0-length row.names)
# Export UniqueID plus every column whose name contains glb_rsp_var for the
# OOB observations. The file name is <glb_out_pfx><glb_sel_mdl_id> with each
# "." turned into "_", followed by "_OOBent.csv".
write.csv(
    glb_OOBobs_df[, c("UniqueID",
                      names(glb_OOBobs_df)[grepl(glb_rsp_var,
                                                 names(glb_OOBobs_df),
                                                 fixed=TRUE)])],
    file=paste0(chartr(".", "_", paste0(glb_out_pfx, glb_sel_mdl_id)),
                "_OOBent.csv"),
    row.names=FALSE)
# print(glb_allobs_df[glb_allobs_df$UniqueID %in% FN_OOB_ids,
# glb_txt_vars])
# dsp_tbl(Headline.contains="[Ee]bola")
# sum(sel_obs(Headline.contains="[Ee]bola"))
# ftable(xtabs(Popular ~ NewsDesk.fctr, data=glb_allobs_df[sel_obs(Headline.contains="[Ee]bola") ,]))
# xtabs(NewsDesk ~ Popular, #Popular ~ NewsDesk.fctr,
# data=glb_allobs_df[sel_obs(Headline.contains="[Ee]bola") ,],
# exclude=NULL)
# print(mycreate_xtab_df(df=glb_allobs_df[sel_obs(Headline.contains="[Ee]bola") ,], c("Popular", "NewsDesk", "SectionName", "SubsectionName")))
# print(mycreate_tbl_df(df=glb_allobs_df[sel_obs(Headline.contains="[Ee]bola") ,], c("Popular", "NewsDesk", "SectionName", "SubsectionName")))
# print(mycreate_tbl_df(df=glb_allobs_df[sel_obs(Headline.contains="[Ee]bola") ,], c("Popular")))
# print(mycreate_tbl_df(df=glb_allobs_df[sel_obs(Headline.contains="[Ee]bola") ,],
# tbl_col_names=c("Popular", "NewsDesk")))
# Log the start of the next "fit.models" step in the chunk timing table.
# With major.inc=FALSE the recorded output keeps step_major at 7 and moves
# step_minor from 2 to 3.
# NOTE(review): myadd_chunk is defined outside this chunk; it presumably also
# stamps the end time of the previous row - confirm.
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.models", major.inc=FALSE)
## label step_major step_minor bgn end elapsed
## 12 fit.models 7 2 489.070 505.385 16.315
## 13 fit.models 7 3 505.386 NA NA
# Sanity check: columns present in glb_trnobs_df but missing from glb_allobs_df
print(setdiff(names(glb_trnobs_df), names(glb_allobs_df)))
## [1] "PubDate.year.fctr" "H.npnct03.log" "H.npnct09.log"
## [4] "H.npnct10.log" "H.npnct17.log" "H.npnct18.log"
## [7] "H.npnct20.log" "H.npnct21.log" "H.npnct22.log"
## [10] "H.npnct23.log" "H.npnct24.log" "H.npnct25.log"
## [13] "H.npnct26.log" "H.npnct27.log" "H.npnct29.log"
## [16] "H.npnct30.log" "H.P.http" "S.npnct02.log"
## [19] "S.npnct05.log" "S.npnct09.log" "S.npnct10.log"
## [22] "S.npnct17.log" "S.npnct18.log" "S.npnct21.log"
## [25] "S.npnct22.log" "S.npnct23.log" "S.npnct24.log"
## [28] "S.npnct25.log" "S.npnct26.log" "S.npnct27.log"
## [31] "S.npnct29.log" "S.npnct30.log" "S.P.http"
## [34] "A.npnct05.log" "A.npnct09.log" "A.npnct10.log"
## [37] "A.npnct21.log" "A.npnct22.log" "A.npnct23.log"
## [40] "A.npnct24.log" "A.npnct25.log" "A.npnct26.log"
## [43] "A.npnct27.log" "A.npnct29.log" "A.npnct30.log"
# Sanity check: columns present in glb_fitobs_df but missing from glb_allobs_df
print(setdiff(names(glb_fitobs_df), names(glb_allobs_df)))
## [1] "PubDate.year.fctr" "H.npnct03.log" "H.npnct09.log"
## [4] "H.npnct10.log" "H.npnct17.log" "H.npnct18.log"
## [7] "H.npnct20.log" "H.npnct21.log" "H.npnct22.log"
## [10] "H.npnct23.log" "H.npnct24.log" "H.npnct25.log"
## [13] "H.npnct26.log" "H.npnct27.log" "H.npnct29.log"
## [16] "H.npnct30.log" "H.P.http" "S.npnct02.log"
## [19] "S.npnct05.log" "S.npnct09.log" "S.npnct10.log"
## [22] "S.npnct17.log" "S.npnct18.log" "S.npnct21.log"
## [25] "S.npnct22.log" "S.npnct23.log" "S.npnct24.log"
## [28] "S.npnct25.log" "S.npnct26.log" "S.npnct27.log"
## [31] "S.npnct29.log" "S.npnct30.log" "S.P.http"
## [34] "A.npnct05.log" "A.npnct09.log" "A.npnct10.log"
## [37] "A.npnct21.log" "A.npnct22.log" "A.npnct23.log"
## [40] "A.npnct24.log" "A.npnct25.log" "A.npnct26.log"
## [43] "A.npnct27.log" "A.npnct29.log" "A.npnct30.log"
# Sanity check: columns present in glb_OOBobs_df but missing from glb_allobs_df.
# These are the columns the loop that follows copies into glb_allobs_df.
print(setdiff(names(glb_OOBobs_df), names(glb_allobs_df)))
## [1] "PubDate.year.fctr"
## [2] "H.npnct03.log"
## [3] "H.npnct09.log"
## [4] "H.npnct10.log"
## [5] "H.npnct17.log"
## [6] "H.npnct18.log"
## [7] "H.npnct20.log"
## [8] "H.npnct21.log"
## [9] "H.npnct22.log"
## [10] "H.npnct23.log"
## [11] "H.npnct24.log"
## [12] "H.npnct25.log"
## [13] "H.npnct26.log"
## [14] "H.npnct27.log"
## [15] "H.npnct29.log"
## [16] "H.npnct30.log"
## [17] "H.P.http"
## [18] "S.npnct02.log"
## [19] "S.npnct05.log"
## [20] "S.npnct09.log"
## [21] "S.npnct10.log"
## [22] "S.npnct17.log"
## [23] "S.npnct18.log"
## [24] "S.npnct21.log"
## [25] "S.npnct22.log"
## [26] "S.npnct23.log"
## [27] "S.npnct24.log"
## [28] "S.npnct25.log"
## [29] "S.npnct26.log"
## [30] "S.npnct27.log"
## [31] "S.npnct29.log"
## [32] "S.npnct30.log"
## [33] "S.P.http"
## [34] "A.npnct05.log"
## [35] "A.npnct09.log"
## [36] "A.npnct10.log"
## [37] "A.npnct21.log"
## [38] "A.npnct22.log"
## [39] "A.npnct23.log"
## [40] "A.npnct24.log"
## [41] "A.npnct25.log"
## [42] "A.npnct26.log"
## [43] "A.npnct27.log"
## [44] "A.npnct29.log"
## [45] "A.npnct30.log"
## [46] "Popular.fctr.predict.Low.cor.X.glm.prob"
## [47] "Popular.fctr.predict.Low.cor.X.glm"
## [48] "Popular.fctr.predict.Low.cor.X.glm.accurate"
# Copy every column that exists only in glb_OOBobs_df (e.g. the selected
# model's predictions) into the OOB rows of glb_allobs_df.
# The copy is positional, so it relies on glb_OOBobs_df rows being in the same
# order as the .lcn == "OOB" rows of glb_allobs_df.
# Merge or cbind ?
# Row positions are computed once; which() also drops any NA in .lcn, which a
# logical index would not tolerate in a data-frame assignment.
OOB_rows <- which(glb_allobs_df$.lcn == "OOB")
# Fail loudly rather than let a shorter replacement be recycled silently
stopifnot(length(OOB_rows) == nrow(glb_OOBobs_df))
for (col in setdiff(names(glb_OOBobs_df), names(glb_allobs_df))) {
    glb_allobs_df[OOB_rows, col] <- glb_OOBobs_df[, col]
}
# Sanity check: columns present in glb_newobs_df but missing from glb_allobs_df
# (the recorded output shows none).
print(setdiff(names(glb_newobs_df), names(glb_allobs_df)))
## character(0)
# Optionally checkpoint the state reached after model selection to
# <glb_out_pfx>selmdl_dsk.RData so later chunks can be resumed from disk.
if (glb_save_envir) {
    save(glb_feats_df,
         glb_allobs_df, #glb_trnobs_df, glb_fitobs_df, glb_OOBobs_df, glb_newobs_df,
         glb_models_df, dsp_models_df, glb_models_lst, glb_sel_mdl, glb_sel_mdl_id,
         glb_model_type,
         file=paste0(glb_out_pfx, "selmdl_dsk.RData"))
}
#load(paste0(glb_out_pfx, "selmdl_dsk.RData"))

# Drop a leftover ret_lst, if any. The existence check avoids the
# "object 'ret_lst' not found" warning that a bare rm() raises when the
# object is absent.
if (exists("ret_lst", inherits=FALSE)) rm(ret_lst)
## Warning in rm(ret_lst): object 'ret_lst' not found
# Record "model.selected" as an available object, then replay the analytics
# Petri net with the updated list.
# The assignment is done as its own statement: when embedded in the argument
# it only ran if and when replay.petrisim() forced that argument.
glb_analytics_avl_objs <- c(glb_analytics_avl_objs, "model.selected")
replay.petrisim(pn=glb_analytics_pn, replay.trans=glb_analytics_avl_objs,
                flip_coord=TRUE)
## time trans "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 multiple enabled transitions: data.training.all data.new model.selected firing: data.training.all
## 1.0000 1 2 1 0 0
## 1.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction firing: data.new
## 2.0000 2 1 1 1 0
## 2.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction data.new.prediction firing: model.selected
## 3.0000 3 0 2 1 0
# Log the start of the "fit.data.training" chunk in the chunk timing table.
# With major.inc=TRUE the recorded output moves step_major from 7 to 8 and
# resets step_minor to 0.
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.data.training", major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 13 fit.models 7 3 505.386 514.309 8.923
## 14 fit.data.training 8 0 514.310 NA NA
8.0: fit data training
#load(paste0(glb_inp_pfx, "dsk.RData"))
# To create specific models
# glb_fin_mdl_id <- NULL; glb_fin_mdl <- NULL;
# glb_sel_mdl_id <- "Conditional.X.cp.0.rpart";
# glb_sel_mdl <- glb_models_lst[[glb_sel_mdl_id]]; print(glb_sel_mdl)
# Build the final model (glb_fin_mdl / glb_fin_mdl_id).
# If a final model id is already set and names an entry of glb_models_lst, the
# selected model is reused as-is; otherwise the selected model's features,
# method and best tuning parameters are refit on glb_trnobs_df under the
# model id "Final".
if (!is.null(glb_fin_mdl_id) && (glb_fin_mdl_id %in% names(glb_models_lst))) {
warning("Final model same as user selected model")
# NOTE(review): this assigns glb_sel_mdl, not glb_models_lst[[glb_fin_mdl_id]];
# the two differ if glb_fin_mdl_id names a model other than the selected one.
glb_fin_mdl <- glb_sel_mdl
} else {
# Features used by the selected model (printed with their importance)
print(mdl_feats_df <- myextract_mdl_feats(sel_mdl=glb_sel_mdl,
entity_df=glb_fitobs_df))
if ((model_method <- glb_sel_mdl$method) == "custom")
# get actual method from the model_id
# (the text after the last "." in glb_sel_mdl_id)
model_method <- tail(unlist(strsplit(glb_sel_mdl_id, "[.]")), 1)
# Tuning grid for the refit: one row per tuning parameter of the selected
# model, with min == max == its best value, so the grid holds one candidate.
# Stays NULL when there are no tuning parameters or all of them are "none".
tune_finmdl_df <- NULL
if (nrow(glb_sel_mdl$bestTune) > 0) {
for (param in names(glb_sel_mdl$bestTune)) {
#print(sprintf("param: %s", param))
if (glb_sel_mdl$bestTune[1, param] != "none")
tune_finmdl_df <- rbind(tune_finmdl_df,
data.frame(parameter=param,
min=glb_sel_mdl$bestTune[1, param],
max=glb_sel_mdl$bestTune[1, param],
by=1)) # by val does not matter
}
}
# Sync with parameters in mydsutils.R
# Refit on the full training set (fit_df=glb_trnobs_df) with no OOB data;
# summary function, metric and maximize flag are taken from the selected model.
ret_lst <- myfit_mdl(model_id="Final", model_method=model_method,
indep_vars_vctr=mdl_feats_df$id, model_type=glb_model_type,
rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
fit_df=glb_trnobs_df, OOB_df=NULL,
n_cv_folds=glb_n_cv_folds, tune_models_df=tune_finmdl_df,
# Automate from here
# Issues if glb_sel_mdl$method == "rf" b/c trainControl is "oob"; not "cv"
model_loss_mtrx=glb_model_metric_terms,
model_summaryFunction=glb_sel_mdl$control$summaryFunction,
model_metric=glb_sel_mdl$metric,
model_metric_maximize=glb_sel_mdl$maximize)
# ret_lst is not read here; the final model is taken from the last entry of
# glb_models_lst and the matching row of glb_models_df.
# NOTE(review): presumably myfit_mdl appends to both globals as a side effect
# and keeps them in the same order - confirm in mydsutils.R.
glb_fin_mdl <- glb_models_lst[[length(glb_models_lst)]]
glb_fin_mdl_id <- glb_models_df[length(glb_models_lst), "model_id"]
}
## id importance
## WordCount.log WordCount.log 1.000000e+02
## H.P.readers.respond H.P.readers.respond 5.146747e+01
## myCategory.fctr myCategory.fctr 4.281250e+01
## H.npnct19.log H.npnct19.log 4.004695e+01
## H.npnct15.log H.npnct15.log 3.467378e+01
## .clusterid.fctr .clusterid.fctr 2.932308e+01
## A.npnct13.log A.npnct13.log 2.871074e+01
## A.npnct19.log A.npnct19.log 2.850228e+01
## S.nuppr.log S.nuppr.log 2.501950e+01
## S.T.diari S.T.diari 2.124067e+01
## H.T.word H.T.word 2.105883e+01
## H.npnct08.log H.npnct08.log 2.045852e+01
## H.T.read H.T.read 2.019170e+01
## H.ndgts.log H.ndgts.log 1.951144e+01
## S.P.metropolitan.diary.colon S.P.metropolitan.diary.colon 1.888990e+01
## S.ratio.sum.TfIdf.nwrds S.ratio.sum.TfIdf.nwrds 1.854118e+01
## A.T.newyork A.T.newyork 1.825819e+01
## H.nuppr.log H.nuppr.log 1.817153e+01
## S.T.make S.T.make 1.723536e+01
## PubDate.wkday.fctr PubDate.wkday.fctr 1.661497e+01
## H.nstopwrds.log H.nstopwrds.log 1.627434e+01
## H.ratio.nstopwrds.nwrds H.ratio.nstopwrds.nwrds 1.623508e+01
## H.npnct11.log H.npnct11.log 1.582943e+01
## S.T.can S.T.can 1.557214e+01
## H.P.no.comment.colon H.P.no.comment.colon 1.552028e+01
## H.P.friday.night.music H.P.friday.night.music 1.488096e+01
## A.T.newyorktim A.T.newyorktim 1.471359e+01
## S.npnct04.log S.npnct04.log 1.357564e+01
## H.T.newyork H.T.newyork 1.337672e+01
## S.T.share S.T.share 1.322366e+01
## S.npnct08.log S.npnct08.log 1.319408e+01
## H.sum.TfIdf H.sum.TfIdf 1.282767e+01
## H.P.recap.colon H.P.recap.colon 1.273820e+01
## S.T.one S.T.one 1.238107e+01
## H.npnct07.log H.npnct07.log 1.222449e+01
## PubDate.last10.log PubDate.last10.log 1.193952e+01
## H.T.report H.T.report 1.191324e+01
## A.nwrds.unq.log A.nwrds.unq.log 1.190743e+01
## A.T.report A.T.report 1.155728e+01
## PubDate.hour.fctr PubDate.hour.fctr 1.144174e+01
## A.T.articl A.T.articl 1.128851e+01
## A.sum.TfIdf A.sum.TfIdf 1.110991e+01
## S.nstopwrds.log S.nstopwrds.log 1.099737e+01
## PubDate.minute.fctr PubDate.minute.fctr 1.055593e+01
## H.T.polit H.T.polit 1.032948e+01
## S.ratio.nstopwrds.nwrds S.ratio.nstopwrds.nwrds 1.026798e+01
## A.T.intern A.T.intern 9.914726e+00
## S.T.time S.T.time 9.904830e+00
## H.npnct12.log H.npnct12.log 9.888614e+00
## S.T.take S.T.take 9.852005e+00
## H.T.art H.T.art 9.842033e+00
## H.npnct13.log H.npnct13.log 9.829552e+00
## PubDate.second.fctr PubDate.second.fctr 9.794379e+00
## H.T.week H.T.week 9.718351e+00
## H.T.get H.T.get 9.576775e+00
## S.npnct01.log S.npnct01.log 9.564919e+00
## A.T.will A.T.will 9.491070e+00
## S.T.show S.T.show 9.342530e+00
## H.T.new H.T.new 9.203584e+00
## .rnorm .rnorm 8.941026e+00
## H.ratio.sum.TfIdf.nwrds H.ratio.sum.TfIdf.nwrds 8.939183e+00
## S.ndgts.log S.ndgts.log 8.755923e+00
## H.T.say H.T.say 8.671813e+00
## A.T.first A.T.first 8.505413e+00
## A.T.photo A.T.photo 8.036467e+00
## H.T.china H.T.china 7.799595e+00
## H.npnct01.log H.npnct01.log 7.787548e+00
## H.T.make H.T.make 7.674641e+00
## A.T.senat A.T.senat 7.437682e+00
## S.T.said S.T.said 7.364645e+00
## S.T.day S.T.day 7.260435e+00
## H.npnct28.log H.npnct28.log 7.016220e+00
## H.T.news H.T.news 6.901869e+00
## H.npnct16.log H.npnct16.log 6.896467e+00
## H.T.take H.T.take 6.881392e+00
## S.npnct12.log S.npnct12.log 6.296543e+00
## H.T.busi H.T.busi 5.892173e+00
## A.T.compani A.T.compani 5.701348e+00
## S.npnct11.log S.npnct11.log 5.458567e+00
## H.T.day H.T.day 5.341694e+00
## A.T.word A.T.word 5.341188e+00
## H.P.facts.figures H.P.facts.figures 5.332319e+00
## H.T.X2014 H.T.X2014 5.105515e+00
## PubDate.last1.log PubDate.last1.log 5.029197e+00
## S.T.obama S.T.obama 4.950818e+00
## PubDate.date.fctr PubDate.date.fctr 4.692266e+00
## H.T.big H.T.big 4.491165e+00
## S.npnct14.log S.npnct14.log 4.449833e+00
## A.npnct16.log A.npnct16.log 4.218694e+00
## S.npnct06.log S.npnct06.log 3.899006e+00
## S.T.appear S.T.appear 3.840502e+00
## PubDate.last100.log PubDate.last100.log 3.387278e+00
## PubDate.wkend PubDate.wkend 3.323031e+00
## H.T.ebola H.T.ebola 3.198159e+00
## H.nwrds.log H.nwrds.log 3.088101e+00
## H.T.obama H.T.obama 2.703165e+00
## A.T.year A.T.year 2.411870e+00
## A.nchrs.log A.nchrs.log 2.210854e+00
## H.T.test H.T.test 1.856699e+00
## A.T.week A.T.week 1.735453e+00
## H.T.pictur H.T.pictur 1.728117e+00
## S.nwrds.log S.nwrds.log 1.511344e+00
## H.T.newyorktim H.T.newyorktim 1.280498e+00
## S.npnct15.log S.npnct15.log 1.190329e+00
## H.T.bank H.T.bank 1.038507e+00
## H.T.billion H.T.billion 7.999936e-01
## S.T.new S.T.new 1.565429e-01
## A.T.fashion A.T.fashion 1.528913e-01
## H.P.fashion.week H.P.fashion.week 1.109378e-01
## S.T.archiv S.T.archiv 9.386317e-02
## S.T.herald S.T.herald 8.612681e-02
## H.T.springsumm H.T.springsumm 7.774263e-02
## S.T.tribun S.T.tribun 7.345077e-02
## H.T.deal H.T.deal 6.047754e-02
## H.P.first.draft H.P.first.draft 4.572882e-02
## S.npnct28.log S.npnct28.log 4.463511e-02
## H.P.daily.clip.report H.P.daily.clip.report 3.416759e-02
## H.P.today.in.smallbusiness H.P.today.in.smallbusiness 2.839539e-02
## H.P.verbatim.colon H.P.verbatim.colon 1.634958e-02
## S.P.first.draft S.P.first.draft 1.547771e-02
## H.npnct02.log H.npnct02.log 1.526519e-02
## H.P.quandary H.P.quandary 1.236942e-02
## S.npnct20.log S.npnct20.log 1.220902e-02
## S.npnct03.log S.npnct03.log 1.207385e-02
## A.npnct18.log A.npnct18.log 1.043696e-02
## A.T.presid A.T.presid 9.440632e-03
## S.T.presid S.T.presid 9.421346e-03
## S.P.year.colon S.P.year.colon 6.252341e-03
## H.P.on.this.day H.P.on.this.day 5.059061e-03
## H.npnct05.log H.npnct05.log 4.614811e-03
## S.npnct07.log S.npnct07.log 2.450155e-03
## S.P.fashion.week S.P.fashion.week 8.838548e-04
## H.P.s.notebook H.P.s.notebook 0.000000e+00
## [1] "fitting model: Final.glm"
## [1] " indep_vars: WordCount.log, H.P.readers.respond, myCategory.fctr, H.npnct19.log, H.npnct15.log, .clusterid.fctr, A.npnct13.log, A.npnct19.log, S.nuppr.log, S.T.diari, H.T.word, H.npnct08.log, H.T.read, H.ndgts.log, S.P.metropolitan.diary.colon, S.ratio.sum.TfIdf.nwrds, A.T.newyork, H.nuppr.log, S.T.make, PubDate.wkday.fctr, H.nstopwrds.log, H.ratio.nstopwrds.nwrds, H.npnct11.log, S.T.can, H.P.no.comment.colon, H.P.friday.night.music, A.T.newyorktim, S.npnct04.log, H.T.newyork, S.T.share, S.npnct08.log, H.sum.TfIdf, H.P.recap.colon, S.T.one, H.npnct07.log, PubDate.last10.log, H.T.report, A.nwrds.unq.log, A.T.report, PubDate.hour.fctr, A.T.articl, A.sum.TfIdf, S.nstopwrds.log, PubDate.minute.fctr, H.T.polit, S.ratio.nstopwrds.nwrds, A.T.intern, S.T.time, H.npnct12.log, S.T.take, H.T.art, H.npnct13.log, PubDate.second.fctr, H.T.week, H.T.get, S.npnct01.log, A.T.will, S.T.show, H.T.new, .rnorm, H.ratio.sum.TfIdf.nwrds, S.ndgts.log, H.T.say, A.T.first, A.T.photo, H.T.china, H.npnct01.log, H.T.make, A.T.senat, S.T.said, S.T.day, H.npnct28.log, H.T.news, H.npnct16.log, H.T.take, S.npnct12.log, H.T.busi, A.T.compani, S.npnct11.log, H.T.day, A.T.word, H.P.facts.figures, H.T.X2014, PubDate.last1.log, S.T.obama, PubDate.date.fctr, H.T.big, S.npnct14.log, A.npnct16.log, S.npnct06.log, S.T.appear, PubDate.last100.log, PubDate.wkend, H.T.ebola, H.nwrds.log, H.T.obama, A.T.year, A.nchrs.log, H.T.test, A.T.week, H.T.pictur, S.nwrds.log, H.T.newyorktim, S.npnct15.log, H.T.bank, H.T.billion, S.T.new, A.T.fashion, H.P.fashion.week, S.T.archiv, S.T.herald, H.T.springsumm, S.T.tribun, H.T.deal, H.P.first.draft, S.npnct28.log, H.P.daily.clip.report, H.P.today.in.smallbusiness, H.P.verbatim.colon, S.P.first.draft, H.npnct02.log, H.P.quandary, S.npnct20.log, S.npnct03.log, A.npnct18.log, A.T.presid, S.T.presid, S.P.year.colon, H.P.on.this.day, H.npnct05.log, S.npnct07.log, S.P.fashion.week, H.P.s.notebook"
## Aggregating results
## Fitting final model on full training set
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1987 -0.2853 -0.0958 0.0000 3.7784
##
## Coefficients: (6 not defined because of singularities)
## Estimate
## (Intercept) -3.339e+00
## WordCount.log 1.330e+00
## H.P.readers.respond 7.224e+00
## `myCategory.fctrForeign#World#Asia Pacific` -5.184e+00
## `myCategory.fctr#Multimedia#` -5.112e+00
## `myCategory.fctrCulture#Arts#` 1.651e+13
## `myCategory.fctrBusiness#Business Day#Dealbook` -2.959e+00
## myCategory.fctrmyOther -4.504e+15
## `myCategory.fctrBusiness#Technology#` -3.415e+00
## `myCategory.fctrBusiness#Crosswords/Games#` 3.241e-01
## `myCategory.fctrTStyle##` -4.824e+00
## `myCategory.fctrForeign#World#` -2.444e+01
## `myCategory.fctrOpEd#Opinion#` 2.201e+00
## `myCategory.fctrStyles##Fashion` -5.688e+00
## `myCategory.fctr#Opinion#Room For Debate` -7.431e+00
## `myCategory.fctr#U.S.#Education` -3.190e+01
## `myCategory.fctr##` -2.753e+00
## `myCategory.fctrMetro#N.Y. / Region#` -1.964e+00
## `myCategory.fctrBusiness#Business Day#Small Business` -4.500e+00
## `myCategory.fctrStyles#U.S.#` 4.115e-01
## `myCategory.fctrTravel#Travel#` -4.512e+00
## `myCategory.fctr#Opinion#The Public Editor` 1.502e-01
## H.npnct19.log 1.446e+00
## H.npnct15.log -1.348e+00
## .clusterid.fctr101 -1.028e-01
## .clusterid.fctr102 -5.393e-01
## .clusterid.fctr103 -3.119e-01
## .clusterid.fctr104 NA
## .clusterid.fctr401 -1.651e+13
## .clusterid.fctr402 -4.520e+15
## .clusterid.fctr403 -1.651e+13
## .clusterid.fctr404 -1.651e+13
## .clusterid.fctr405 -1.651e+13
## .clusterid.fctr406 -1.651e+13
## .clusterid.fctr407 -4.520e+15
## .clusterid.fctr408 -1.651e+13
## .clusterid.fctr409 -1.651e+13
## .clusterid.fctr410 -1.651e+13
## .clusterid.fctr411 -1.651e+13
## .clusterid.fctr412 -1.651e+13
## .clusterid.fctr413 -1.651e+13
## .clusterid.fctr414 -1.651e+13
## .clusterid.fctr415 -1.651e+13
## .clusterid.fctr501 3.215e-01
## .clusterid.fctr502 -9.645e-01
## .clusterid.fctr503 -1.225e+00
## .clusterid.fctr504 -1.081e+00
## .clusterid.fctr505 5.805e-01
## .clusterid.fctr506 -6.287e-01
## .clusterid.fctr507 -9.074e-01
## .clusterid.fctr508 -4.791e-02
## .clusterid.fctr509 -4.248e-01
## .clusterid.fctr510 9.234e-02
## .clusterid.fctr511 -8.361e-01
## .clusterid.fctr512 -9.010e-01
## .clusterid.fctr513 NA
## .clusterid.fctr701 8.061e-01
## .clusterid.fctr702 1.083e+00
## .clusterid.fctr703 7.609e-02
## .clusterid.fctr704 4.677e-01
## .clusterid.fctr705 7.901e-01
## .clusterid.fctr706 1.584e+00
## .clusterid.fctr707 NA
## .clusterid.fctr1101 -1.704e+00
## .clusterid.fctr1102 -7.382e-01
## .clusterid.fctr1103 -7.395e-01
## .clusterid.fctr1104 -2.110e+00
## .clusterid.fctr1105 -2.031e+00
## .clusterid.fctr1106 -1.693e+00
## .clusterid.fctr1107 -3.109e+00
## .clusterid.fctr1108 -2.033e+00
## .clusterid.fctr1109 NA
## .clusterid.fctr1501 -2.297e-01
## .clusterid.fctr1502 -5.858e-01
## .clusterid.fctr1503 2.799e-01
## .clusterid.fctr1504 -1.163e+00
## .clusterid.fctr1505 -7.310e-01
## .clusterid.fctr1506 1.007e+00
## .clusterid.fctr1507 -6.823e-01
## .clusterid.fctr1508 -1.655e-01
## .clusterid.fctr1509 -7.944e-03
## .clusterid.fctr1510 -7.315e-01
## .clusterid.fctr1511 -2.605e+01
## .clusterid.fctr1512 4.243e-01
## .clusterid.fctr1513 3.473e-01
## .clusterid.fctr1514 -1.794e+01
## .clusterid.fctr1515 -2.477e+00
## .clusterid.fctr1516 6.251e-01
## .clusterid.fctr1517 1.068e+00
## .clusterid.fctr1518 -1.809e+00
## .clusterid.fctr1519 -1.326e+00
## .clusterid.fctr1520 -1.738e+01
## .clusterid.fctr1521 9.047e-01
## .clusterid.fctr1522 -3.232e-01
## .clusterid.fctr1523 -2.303e+01
## .clusterid.fctr1524 NA
## .clusterid.fctr1801 -1.829e+00
## .clusterid.fctr1802 -1.466e+00
## .clusterid.fctr1803 -1.074e+00
## .clusterid.fctr1804 NA
## A.npnct13.log 7.999e-01
## A.npnct19.log 1.336e+00
## S.nuppr.log -5.234e-01
## S.T.diari 9.831e+00
## H.T.word 2.403e+00
## H.npnct08.log 9.852e-01
## H.T.read -1.134e+00
## H.ndgts.log 3.365e-01
## S.P.metropolitan.diary.colon -4.193e+00
## S.ratio.sum.TfIdf.nwrds 1.620e+00
## A.T.newyork 1.785e+00
## H.nuppr.log 1.144e+00
## S.T.make -6.633e-01
## PubDate.wkday.fctr1 -2.590e-01
## PubDate.wkday.fctr2 -7.000e-01
## PubDate.wkday.fctr3 -3.628e-01
## PubDate.wkday.fctr4 -4.692e-01
## PubDate.wkday.fctr5 -5.438e-01
## PubDate.wkday.fctr6 -7.023e-01
## H.nstopwrds.log -1.099e+00
## H.ratio.nstopwrds.nwrds 4.119e+00
## H.npnct11.log 3.655e-01
## S.T.can -1.474e+00
## H.P.no.comment.colon 2.340e+00
## H.P.friday.night.music -2.962e+00
## A.T.newyorktim 1.799e+00
## S.npnct04.log -9.875e-01
## H.T.newyork -1.124e+00
## S.T.share -1.735e+00
## S.npnct08.log 1.964e-01
## H.sum.TfIdf -1.408e-01
## H.P.recap.colon 1.162e+00
## S.T.one -5.478e-01
## H.npnct07.log 8.625e-02
## PubDate.last10.log 1.393e-01
## H.T.report -1.383e+00
## A.nwrds.unq.log -1.006e+00
## A.T.report -7.503e-01
## `PubDate.hour.fctr(7.67,15.3]` 4.497e-02
## `PubDate.hour.fctr(15.3,23]` 2.026e-01
## A.T.articl -2.146e-01
## A.sum.TfIdf -1.535e-01
## S.nstopwrds.log 3.263e+00
## `PubDate.minute.fctr(14.8,29.5]` -1.886e-02
## `PubDate.minute.fctr(29.5,44.2]` -2.292e-01
## `PubDate.minute.fctr(44.2,59.1]` 5.723e-02
## H.T.polit -1.471e-01
## S.ratio.nstopwrds.nwrds -8.755e+00
## A.T.intern 5.483e-01
## S.T.time -1.097e+00
## H.npnct12.log 2.283e-01
## S.T.take -3.055e-01
## H.T.art -2.849e-01
## H.npnct13.log -1.338e-01
## `PubDate.second.fctr(14.8,29.5]` -1.043e-01
## `PubDate.second.fctr(29.5,44.2]` -4.298e-02
## `PubDate.second.fctr(44.2,59.1]` -2.372e-01
## H.T.week -9.714e-01
## H.T.get 6.684e-01
## S.npnct01.log 2.152e+00
## A.T.will -6.938e-01
## S.T.show -6.315e-01
## H.T.new -5.115e-01
## .rnorm -6.875e-02
## H.ratio.sum.TfIdf.nwrds 2.495e-01
## S.ndgts.log 1.140e-01
## H.T.say -2.984e-01
## A.T.first 8.650e-01
## A.T.photo 5.792e-01
## H.T.china -3.447e-01
## H.npnct01.log -1.038e+00
## H.T.make -4.086e-01
## A.T.senat 3.842e-01
## S.T.said 1.500e+00
## S.T.day -4.556e-01
## H.npnct28.log -9.092e-01
## H.T.news -1.031e+00
## H.npnct16.log 3.906e-01
## H.T.take -3.767e-01
## S.npnct12.log -1.789e-01
## H.T.busi -1.028e+00
## A.T.compani -9.412e-01
## S.npnct11.log -1.191e-01
## H.T.day -2.605e-01
## A.T.word -7.600e-01
## H.P.facts.figures 2.801e+00
## H.T.X2014 -4.679e-01
## PubDate.last1.log -5.733e-03
## S.T.obama -1.370e-01
## `PubDate.date.fctr(7,13]` 9.226e-02
## `PubDate.date.fctr(13,19]` -4.974e-02
## `PubDate.date.fctr(19,25]` -8.867e-03
## `PubDate.date.fctr(25,31]` 1.374e-02
## H.T.big -9.453e-02
## S.npnct14.log 5.957e-01
## A.npnct16.log -2.242e-01
## S.npnct06.log 1.571e+00
## S.T.appear 1.720e-03
## PubDate.last100.log 1.890e-03
## PubDate.wkend 1.408e-02
## H.T.ebola -4.525e-01
## H.nwrds.log -5.301e-02
## H.T.obama -1.383e-01
## A.T.year -8.905e-01
## A.nchrs.log -9.424e-01
## H.T.test 2.701e-01
## A.T.week -4.199e-01
## H.T.pictur 8.802e-02
## S.nwrds.log -6.217e-01
## H.T.newyorktim -7.976e-01
## S.npnct15.log -4.332e-01
## H.T.bank -2.843e-02
## H.T.billion 7.621e-01
## S.T.new -3.024e-01
## A.T.fashion -2.111e+00
## H.P.fashion.week -1.971e+01
## S.T.archiv -6.665e+01
## S.T.herald 7.564e+01
## H.T.springsumm -1.941e+01
## S.T.tribun -7.195e+01
## H.T.deal -1.922e-02
## H.P.first.draft -2.090e+01
## S.npnct28.log -2.125e+01
## H.P.daily.clip.report -2.291e+01
## H.P.today.in.smallbusiness -2.134e+01
## H.P.verbatim.colon -2.015e+01
## S.P.first.draft -2.115e+01
## H.npnct02.log -2.325e+01
## H.P.quandary 2.778e+01
## S.npnct20.log -3.420e+01
## S.npnct03.log -3.893e+01
## A.npnct18.log -3.539e+01
## A.T.presid 1.221e+03
## S.T.presid -1.221e+03
## S.P.year.colon -1.573e+01
## H.P.on.this.day -2.134e+01
## H.npnct05.log 1.348e+00
## S.npnct07.log -3.302e+01
## S.P.fashion.week -1.866e+01
## H.P.s.notebook 2.344e+00
## Std. Error
## (Intercept) 4.224e+00
## WordCount.log 8.977e-02
## H.P.readers.respond 1.025e+00
## `myCategory.fctrForeign#World#Asia Pacific` 9.192e-01
## `myCategory.fctr#Multimedia#` 1.011e+00
## `myCategory.fctrCulture#Arts#` 5.893e+13
## `myCategory.fctrBusiness#Business Day#Dealbook` 9.443e-01
## myCategory.fctrmyOther 1.089e+07
## `myCategory.fctrBusiness#Technology#` 1.056e+00
## `myCategory.fctrBusiness#Crosswords/Games#` 7.436e-01
## `myCategory.fctrTStyle##` 7.665e-01
## `myCategory.fctrForeign#World#` 1.740e+04
## `myCategory.fctrOpEd#Opinion#` 1.300e+00
## `myCategory.fctrStyles##Fashion` 1.305e+00
## `myCategory.fctr#Opinion#Room For Debate` 1.031e+00
## `myCategory.fctr#U.S.#Education` 2.835e+04
## `myCategory.fctr##` 1.552e+00
## `myCategory.fctrMetro#N.Y. / Region#` 7.953e-01
## `myCategory.fctrBusiness#Business Day#Small Business` 8.508e-01
## `myCategory.fctrStyles#U.S.#` 1.249e+00
## `myCategory.fctrTravel#Travel#` 1.210e+00
## `myCategory.fctr#Opinion#The Public Editor` 9.768e-01
## H.npnct19.log 2.839e-01
## H.npnct15.log 3.239e-01
## .clusterid.fctr101 6.965e-01
## .clusterid.fctr102 7.387e-01
## .clusterid.fctr103 8.334e-01
## .clusterid.fctr104 NA
## .clusterid.fctr401 5.893e+13
## .clusterid.fctr402 5.893e+13
## .clusterid.fctr403 5.893e+13
## .clusterid.fctr404 5.893e+13
## .clusterid.fctr405 5.893e+13
## .clusterid.fctr406 5.893e+13
## .clusterid.fctr407 5.893e+13
## .clusterid.fctr408 5.893e+13
## .clusterid.fctr409 5.893e+13
## .clusterid.fctr410 5.893e+13
## .clusterid.fctr411 5.893e+13
## .clusterid.fctr412 5.893e+13
## .clusterid.fctr413 5.893e+13
## .clusterid.fctr414 5.893e+13
## .clusterid.fctr415 5.893e+13
## .clusterid.fctr501 6.936e-01
## .clusterid.fctr502 7.732e-01
## .clusterid.fctr503 8.443e-01
## .clusterid.fctr504 8.580e-01
## .clusterid.fctr505 9.339e-01
## .clusterid.fctr506 7.788e-01
## .clusterid.fctr507 8.492e-01
## .clusterid.fctr508 9.038e-01
## .clusterid.fctr509 9.208e-01
## .clusterid.fctr510 7.964e-01
## .clusterid.fctr511 9.905e-01
## .clusterid.fctr512 9.482e-01
## .clusterid.fctr513 NA
## .clusterid.fctr701 8.890e-01
## .clusterid.fctr702 8.838e-01
## .clusterid.fctr703 1.367e+00
## .clusterid.fctr704 9.241e-01
## .clusterid.fctr705 9.146e-01
## .clusterid.fctr706 9.091e-01
## .clusterid.fctr707 NA
## .clusterid.fctr1101 1.153e+00
## .clusterid.fctr1102 1.174e+00
## .clusterid.fctr1103 1.264e+00
## .clusterid.fctr1104 1.151e+00
## .clusterid.fctr1105 1.196e+00
## .clusterid.fctr1106 1.205e+00
## .clusterid.fctr1107 1.209e+00
## .clusterid.fctr1108 1.198e+00
## .clusterid.fctr1109 NA
## .clusterid.fctr1501 1.483e+00
## .clusterid.fctr1502 1.487e+00
## .clusterid.fctr1503 1.463e+00
## .clusterid.fctr1504 1.579e+00
## .clusterid.fctr1505 1.489e+00
## .clusterid.fctr1506 1.636e+00
## .clusterid.fctr1507 1.651e+00
## .clusterid.fctr1508 1.596e+00
## .clusterid.fctr1509 1.480e+00
## .clusterid.fctr1510 1.565e+00
## .clusterid.fctr1511 5.059e+04
## .clusterid.fctr1512 1.526e+00
## .clusterid.fctr1513 1.565e+00
## .clusterid.fctr1514 3.876e+04
## .clusterid.fctr1515 1.793e+00
## .clusterid.fctr1516 1.493e+00
## .clusterid.fctr1517 1.519e+00
## .clusterid.fctr1518 1.787e+00
## .clusterid.fctr1519 1.544e+00
## .clusterid.fctr1520 5.830e+04
## .clusterid.fctr1521 1.481e+00
## .clusterid.fctr1522 1.522e+00
## .clusterid.fctr1523 7.174e+04
## .clusterid.fctr1524 NA
## .clusterid.fctr1801 1.076e+00
## .clusterid.fctr1802 1.065e+00
## .clusterid.fctr1803 1.148e+00
## .clusterid.fctr1804 NA
## A.npnct13.log 2.477e-01
## A.npnct19.log 3.341e-01
## S.nuppr.log 1.483e-01
## S.T.diari 7.430e+00
## H.T.word 8.963e-01
## H.npnct08.log 4.301e-01
## H.T.read 4.132e-01
## H.ndgts.log 2.377e-01
## S.P.metropolitan.diary.colon 3.360e+00
## S.ratio.sum.TfIdf.nwrds 5.438e-01
## A.T.newyork 9.649e-01
## H.nuppr.log 6.124e-01
## S.T.make 5.458e-01
## PubDate.wkday.fctr1 4.429e-01
## PubDate.wkday.fctr2 4.861e-01
## PubDate.wkday.fctr3 4.794e-01
## PubDate.wkday.fctr4 4.722e-01
## PubDate.wkday.fctr5 4.792e-01
## PubDate.wkday.fctr6 4.355e-01
## H.nstopwrds.log 5.135e-01
## H.ratio.nstopwrds.nwrds 2.315e+00
## H.npnct11.log 1.829e-01
## S.T.can 5.694e-01
## H.P.no.comment.colon 1.007e+00
## H.P.friday.night.music 1.307e+00
## A.T.newyorktim 1.040e+00
## S.npnct04.log 5.655e-01
## H.T.newyork 4.975e-01
## S.T.share 1.005e+00
## S.npnct08.log 6.467e-01
## H.sum.TfIdf 7.767e-02
## H.P.recap.colon 7.334e-01
## S.T.one 5.849e-01
## H.npnct07.log 1.875e-01
## PubDate.last10.log 1.014e-01
## H.T.report 7.134e-01
## A.nwrds.unq.log 7.228e-01
## A.T.report 8.263e-01
## `PubDate.hour.fctr(7.67,15.3]` 2.115e-01
## `PubDate.hour.fctr(15.3,23]` 2.125e-01
## A.T.articl 1.572e+00
## A.sum.TfIdf 9.779e-02
## S.nstopwrds.log 1.476e+00
## `PubDate.minute.fctr(14.8,29.5]` 1.571e-01
## `PubDate.minute.fctr(29.5,44.2]` 1.538e-01
## `PubDate.minute.fctr(44.2,59.1]` 1.592e-01
## H.T.polit 3.689e-01
## S.ratio.nstopwrds.nwrds 4.365e+00
## A.T.intern 1.518e+00
## S.T.time 6.913e-01
## H.npnct12.log 2.702e-01
## S.T.take 8.667e-01
## H.T.art 6.787e-01
## H.npnct13.log 1.951e-01
## `PubDate.second.fctr(14.8,29.5]` 1.516e-01
## `PubDate.second.fctr(29.5,44.2]` 1.500e-01
## `PubDate.second.fctr(44.2,59.1]` 1.525e-01
## H.T.week 6.903e-01
## H.T.get 3.716e-01
## S.npnct01.log 1.236e+00
## A.T.will 6.686e-01
## S.T.show 7.838e-01
## H.T.new 3.909e-01
## .rnorm 5.425e-02
## H.ratio.sum.TfIdf.nwrds 1.651e-01
## S.ndgts.log 1.835e-01
## H.T.say 3.747e-01
## A.T.first 9.014e-01
## A.T.photo 1.838e+00
## H.T.china 6.049e-01
## H.npnct01.log 1.004e+00
## H.T.make 3.041e-01
## A.T.senat 8.443e-01
## S.T.said 7.378e-01
## S.T.day 8.992e-01
## H.npnct28.log 1.699e+00
## H.T.news 8.017e-01
## H.npnct16.log 5.333e-01
## H.T.take 3.854e-01
## S.npnct12.log 1.858e-01
## H.T.busi 7.819e-01
## A.T.compani 7.349e-01
## S.npnct11.log 1.401e-01
## H.T.day 4.847e-01
## A.T.word 1.056e+00
## H.P.facts.figures 1.261e+00
## H.T.X2014 9.614e-01
## PubDate.last1.log 3.845e-02
## S.T.obama 1.117e+00
## `PubDate.date.fctr(7,13]` 1.673e-01
## `PubDate.date.fctr(13,19]` 1.684e-01
## `PubDate.date.fctr(19,25]` 1.649e-01
## `PubDate.date.fctr(25,31]` 1.801e-01
## H.T.big 3.810e-01
## S.npnct14.log 1.606e+00
## A.npnct16.log 1.119e+00
## S.npnct06.log 8.419e-01
## S.T.appear 1.111e+00
## PubDate.last100.log 3.746e-02
## PubDate.wkend 3.641e-01
## H.T.ebola 2.755e-01
## H.nwrds.log 8.181e-01
## H.T.obama 4.290e-01
## A.T.year 7.786e-01
## A.nchrs.log 7.118e-01
## H.T.test 6.129e-01
## A.T.week 8.452e-01
## H.T.pictur 6.174e-01
## S.nwrds.log 1.705e+00
## H.T.newyorktim 1.056e+00
## S.npnct15.log 4.678e-01
## H.T.bank 4.081e-01
## H.T.billion 5.466e-01
## S.T.new 6.675e-01
## A.T.fashion 3.052e+00
## H.P.fashion.week 1.871e+04
## S.T.archiv 5.889e+04
## S.T.herald 5.652e+04
## H.T.springsumm 2.279e+04
## S.T.tribun 6.424e+04
## H.T.deal 4.878e-01
## H.P.first.draft 3.496e+04
## S.npnct28.log 3.360e+04
## H.P.daily.clip.report 4.403e+04
## H.P.today.in.smallbusiness 4.431e+04
## H.P.verbatim.colon 5.543e+04
## S.P.first.draft 7.145e+04
## H.npnct02.log 7.008e+04
## H.P.quandary 1.029e+05
## S.npnct20.log 1.371e+05
## S.npnct03.log 1.747e+05
## A.npnct18.log 1.367e+05
## A.T.presid 3.329e+06
## S.T.presid 3.329e+06
## S.P.year.colon 6.947e+04
## H.P.on.this.day 8.706e+04
## H.npnct05.log 1.717e+00
## S.npnct07.log 1.911e+05
## S.P.fashion.week 2.224e+04
## H.P.s.notebook 1.248e+00
## z value Pr(>|z|)
## (Intercept) -7.900e-01 0.429302
## WordCount.log 1.481e+01 < 2e-16
## H.P.readers.respond 7.046e+00 1.84e-12
## `myCategory.fctrForeign#World#Asia Pacific` -5.639e+00 1.71e-08
## `myCategory.fctr#Multimedia#` -5.055e+00 4.30e-07
## `myCategory.fctrCulture#Arts#` 2.800e-01 0.779415
## `myCategory.fctrBusiness#Business Day#Dealbook` -3.133e+00 0.001729
## myCategory.fctrmyOther -4.135e+08 < 2e-16
## `myCategory.fctrBusiness#Technology#` -3.235e+00 0.001218
## `myCategory.fctrBusiness#Crosswords/Games#` 4.360e-01 0.662993
## `myCategory.fctrTStyle##` -6.293e+00 3.11e-10
## `myCategory.fctrForeign#World#` -1.000e-03 0.998879
## `myCategory.fctrOpEd#Opinion#` 1.692e+00 0.090630
## `myCategory.fctrStyles##Fashion` -4.358e+00 1.31e-05
## `myCategory.fctr#Opinion#Room For Debate` -7.208e+00 5.66e-13
## `myCategory.fctr#U.S.#Education` -1.000e-03 0.999102
## `myCategory.fctr##` -1.774e+00 0.076058
## `myCategory.fctrMetro#N.Y. / Region#` -2.470e+00 0.013524
## `myCategory.fctrBusiness#Business Day#Small Business` -5.289e+00 1.23e-07
## `myCategory.fctrStyles#U.S.#` 3.290e-01 0.741846
## `myCategory.fctrTravel#Travel#` -3.730e+00 0.000192
## `myCategory.fctr#Opinion#The Public Editor` 1.540e-01 0.877779
## H.npnct19.log 5.093e+00 3.53e-07
## H.npnct15.log -4.163e+00 3.14e-05
## .clusterid.fctr101 -1.480e-01 0.882655
## .clusterid.fctr102 -7.300e-01 0.465373
## .clusterid.fctr103 -3.740e-01 0.708185
## .clusterid.fctr104 NA NA
## .clusterid.fctr401 -2.800e-01 0.779415
## .clusterid.fctr402 -7.670e+01 < 2e-16
## .clusterid.fctr403 -2.800e-01 0.779415
## .clusterid.fctr404 -2.800e-01 0.779415
## .clusterid.fctr405 -2.800e-01 0.779415
## .clusterid.fctr406 -2.800e-01 0.779415
## .clusterid.fctr407 -7.670e+01 < 2e-16
## .clusterid.fctr408 -2.800e-01 0.779415
## .clusterid.fctr409 -2.800e-01 0.779415
## .clusterid.fctr410 -2.800e-01 0.779415
## .clusterid.fctr411 -2.800e-01 0.779415
## .clusterid.fctr412 -2.800e-01 0.779415
## .clusterid.fctr413 -2.800e-01 0.779415
## .clusterid.fctr414 -2.800e-01 0.779415
## .clusterid.fctr415 -2.800e-01 0.779415
## .clusterid.fctr501 4.640e-01 0.642942
## .clusterid.fctr502 -1.247e+00 0.212264
## .clusterid.fctr503 -1.451e+00 0.146823
## .clusterid.fctr504 -1.260e+00 0.207754
## .clusterid.fctr505 6.220e-01 0.534179
## .clusterid.fctr506 -8.070e-01 0.419525
## .clusterid.fctr507 -1.069e+00 0.285286
## .clusterid.fctr508 -5.300e-02 0.957720
## .clusterid.fctr509 -4.610e-01 0.644527
## .clusterid.fctr510 1.160e-01 0.907692
## .clusterid.fctr511 -8.440e-01 0.398606
## .clusterid.fctr512 -9.500e-01 0.341984
## .clusterid.fctr513 NA NA
## .clusterid.fctr701 9.070e-01 0.364504
## .clusterid.fctr702 1.226e+00 0.220233
## .clusterid.fctr703 5.600e-02 0.955613
## .clusterid.fctr704 5.060e-01 0.612776
## .clusterid.fctr705 8.640e-01 0.387678
## .clusterid.fctr706 1.742e+00 0.081481
## .clusterid.fctr707 NA NA
## .clusterid.fctr1101 -1.478e+00 0.139499
## .clusterid.fctr1102 -6.290e-01 0.529550
## .clusterid.fctr1103 -5.850e-01 0.558454
## .clusterid.fctr1104 -1.834e+00 0.066647
## .clusterid.fctr1105 -1.698e+00 0.089533
## .clusterid.fctr1106 -1.405e+00 0.160045
## .clusterid.fctr1107 -2.572e+00 0.010104
## .clusterid.fctr1108 -1.696e+00 0.089866
## .clusterid.fctr1109 NA NA
## .clusterid.fctr1501 -1.550e-01 0.876901
## .clusterid.fctr1502 -3.940e-01 0.693546
## .clusterid.fctr1503 1.910e-01 0.848333
## .clusterid.fctr1504 -7.370e-01 0.461256
## .clusterid.fctr1505 -4.910e-01 0.623474
## .clusterid.fctr1506 6.150e-01 0.538239
## .clusterid.fctr1507 -4.130e-01 0.679342
## .clusterid.fctr1508 -1.040e-01 0.917382
## .clusterid.fctr1509 -5.000e-03 0.995717
## .clusterid.fctr1510 -4.670e-01 0.640159
## .clusterid.fctr1511 -1.000e-03 0.999589
## .clusterid.fctr1512 2.780e-01 0.780959
## .clusterid.fctr1513 2.220e-01 0.824399
## .clusterid.fctr1514 0.000e+00 0.999631
## .clusterid.fctr1515 -1.381e+00 0.167166
## .clusterid.fctr1516 4.190e-01 0.675356
## .clusterid.fctr1517 7.030e-01 0.481991
## .clusterid.fctr1518 -1.012e+00 0.311431
## .clusterid.fctr1519 -8.590e-01 0.390448
## .clusterid.fctr1520 0.000e+00 0.999762
## .clusterid.fctr1521 6.110e-01 0.541169
## .clusterid.fctr1522 -2.120e-01 0.831815
## .clusterid.fctr1523 0.000e+00 0.999744
## .clusterid.fctr1524 NA NA
## .clusterid.fctr1801 -1.700e+00 0.089103
## .clusterid.fctr1802 -1.376e+00 0.168743
## .clusterid.fctr1803 -9.360e-01 0.349379
## .clusterid.fctr1804 NA NA
## A.npnct13.log 3.229e+00 0.001243
## A.npnct19.log 3.999e+00 6.36e-05
## S.nuppr.log -3.531e+00 0.000415
## S.T.diari 1.323e+00 0.185801
## H.T.word 2.681e+00 0.007349
## H.npnct08.log 2.291e+00 0.021979
## H.T.read -2.746e+00 0.006040
## H.ndgts.log 1.416e+00 0.156909
## S.P.metropolitan.diary.colon -1.248e+00 0.212076
## S.ratio.sum.TfIdf.nwrds 2.979e+00 0.002894
## A.T.newyork 1.850e+00 0.064346
## H.nuppr.log 1.868e+00 0.061807
## S.T.make -1.215e+00 0.224243
## PubDate.wkday.fctr1 -5.850e-01 0.558704
## PubDate.wkday.fctr2 -1.440e+00 0.149882
## PubDate.wkday.fctr3 -7.570e-01 0.449234
## PubDate.wkday.fctr4 -9.940e-01 0.320406
## PubDate.wkday.fctr5 -1.135e+00 0.256459
## PubDate.wkday.fctr6 -1.613e+00 0.106805
## H.nstopwrds.log -2.141e+00 0.032313
## H.ratio.nstopwrds.nwrds 1.779e+00 0.075187
## H.npnct11.log 1.998e+00 0.045678
## S.T.can -2.589e+00 0.009621
## H.P.no.comment.colon 2.325e+00 0.020091
## H.P.friday.night.music -2.266e+00 0.023433
## A.T.newyorktim 1.730e+00 0.083647
## S.npnct04.log -1.746e+00 0.080791
## H.T.newyork -2.260e+00 0.023835
## S.T.share -1.726e+00 0.084433
## S.npnct08.log 3.040e-01 0.761380
## H.sum.TfIdf -1.813e+00 0.069785
## H.P.recap.colon 1.585e+00 0.113029
## S.T.one -9.370e-01 0.348961
## H.npnct07.log 4.600e-01 0.645561
## PubDate.last10.log 1.373e+00 0.169713
## H.T.report -1.938e+00 0.052627
## A.nwrds.unq.log -1.391e+00 0.164085
## A.T.report -9.080e-01 0.363880
## `PubDate.hour.fctr(7.67,15.3]` 2.130e-01 0.831614
## `PubDate.hour.fctr(15.3,23]` 9.540e-01 0.340316
## A.T.articl -1.360e-01 0.891439
## A.sum.TfIdf -1.570e+00 0.116490
## S.nstopwrds.log 2.210e+00 0.027071
## `PubDate.minute.fctr(14.8,29.5]` -1.200e-01 0.904473
## `PubDate.minute.fctr(29.5,44.2]` -1.491e+00 0.136053
## `PubDate.minute.fctr(44.2,59.1]` 3.590e-01 0.719240
## H.T.polit -3.990e-01 0.690136
## S.ratio.nstopwrds.nwrds -2.006e+00 0.044877
## A.T.intern 3.610e-01 0.717929
## S.T.time -1.587e+00 0.112543
## H.npnct12.log 8.450e-01 0.398144
## S.T.take -3.520e-01 0.724493
## H.T.art -4.200e-01 0.674610
## H.npnct13.log -6.860e-01 0.492966
## `PubDate.second.fctr(14.8,29.5]` -6.880e-01 0.491559
## `PubDate.second.fctr(29.5,44.2]` -2.860e-01 0.774508
## `PubDate.second.fctr(44.2,59.1]` -1.555e+00 0.119858
## H.T.week -1.407e+00 0.159320
## H.T.get 1.799e+00 0.072069
## S.npnct01.log 1.742e+00 0.081543
## A.T.will -1.038e+00 0.299440
## S.T.show -8.060e-01 0.420398
## H.T.new -1.309e+00 0.190644
## .rnorm -1.267e+00 0.204996
## H.ratio.sum.TfIdf.nwrds 1.512e+00 0.130660
## S.ndgts.log 6.210e-01 0.534413
## H.T.say -7.960e-01 0.425754
## A.T.first 9.600e-01 0.337230
## A.T.photo 3.150e-01 0.752658
## H.T.china -5.700e-01 0.568823
## H.npnct01.log -1.034e+00 0.301218
## H.T.make -1.343e+00 0.179187
## A.T.senat 4.550e-01 0.649051
## S.T.said 2.033e+00 0.042097
## S.T.day -5.070e-01 0.612432
## H.npnct28.log -5.350e-01 0.592600
## H.T.news -1.286e+00 0.198286
## H.npnct16.log 7.320e-01 0.463970
## H.T.take -9.770e-01 0.328376
## S.npnct12.log -9.630e-01 0.335415
## H.T.busi -1.315e+00 0.188575
## A.T.compani -1.281e+00 0.200309
## S.npnct11.log -8.510e-01 0.395000
## H.T.day -5.380e-01 0.590885
## A.T.word -7.200e-01 0.471701
## H.P.facts.figures 2.220e+00 0.026391
## H.T.X2014 -4.870e-01 0.626496
## PubDate.last1.log -1.490e-01 0.881467
## S.T.obama -1.230e-01 0.902339
## `PubDate.date.fctr(7,13]` 5.510e-01 0.581356
## `PubDate.date.fctr(13,19]` -2.950e-01 0.767713
## `PubDate.date.fctr(19,25]` -5.400e-02 0.957116
## `PubDate.date.fctr(25,31]` 7.600e-02 0.939194
## H.T.big -2.480e-01 0.804025
## S.npnct14.log 3.710e-01 0.710658
## A.npnct16.log -2.000e-01 0.841238
## S.npnct06.log 1.866e+00 0.062072
## S.T.appear 2.000e-03 0.998764
## PubDate.last100.log 5.000e-02 0.959752
## PubDate.wkend 3.900e-02 0.969157
## H.T.ebola -1.642e+00 0.100491
## H.nwrds.log -6.500e-02 0.948334
## H.T.obama -3.220e-01 0.747185
## A.T.year -1.144e+00 0.252740
## A.nchrs.log -1.324e+00 0.185529
## H.T.test 4.410e-01 0.659452
## A.T.week -4.970e-01 0.619327
## H.T.pictur 1.430e-01 0.886626
## S.nwrds.log -3.650e-01 0.715391
## H.T.newyorktim -7.550e-01 0.450245
## S.npnct15.log -9.260e-01 0.354419
## H.T.bank -7.000e-02 0.944458
## H.T.billion 1.394e+00 0.163277
## S.T.new -4.530e-01 0.650563
## A.T.fashion -6.920e-01 0.489063
## H.P.fashion.week -1.000e-03 0.999160
## S.T.archiv -1.000e-03 0.999097
## S.T.herald 1.000e-03 0.998932
## H.T.springsumm -1.000e-03 0.999321
## S.T.tribun -1.000e-03 0.999106
## H.T.deal -3.900e-02 0.968565
## H.P.first.draft -1.000e-03 0.999523
## S.npnct28.log -1.000e-03 0.999495
## H.P.daily.clip.report -1.000e-03 0.999585
## H.P.today.in.smallbusiness 0.000e+00 0.999616
## H.P.verbatim.colon 0.000e+00 0.999710
## S.P.first.draft 0.000e+00 0.999764
## H.npnct02.log 0.000e+00 0.999735
## H.P.quandary 0.000e+00 0.999785
## S.npnct20.log 0.000e+00 0.999801
## S.npnct03.log 0.000e+00 0.999822
## A.npnct18.log 0.000e+00 0.999793
## A.T.presid 0.000e+00 0.999707
## S.T.presid 0.000e+00 0.999707
## S.P.year.colon 0.000e+00 0.999819
## H.P.on.this.day 0.000e+00 0.999804
## H.npnct05.log 7.850e-01 0.432527
## S.npnct07.log 0.000e+00 0.999862
## S.P.fashion.week -1.000e-03 0.999331
## H.P.s.notebook 1.878e+00 0.060401
##
## (Intercept)
## WordCount.log ***
## H.P.readers.respond ***
## `myCategory.fctrForeign#World#Asia Pacific` ***
## `myCategory.fctr#Multimedia#` ***
## `myCategory.fctrCulture#Arts#`
## `myCategory.fctrBusiness#Business Day#Dealbook` **
## myCategory.fctrmyOther ***
## `myCategory.fctrBusiness#Technology#` **
## `myCategory.fctrBusiness#Crosswords/Games#`
## `myCategory.fctrTStyle##` ***
## `myCategory.fctrForeign#World#`
## `myCategory.fctrOpEd#Opinion#` .
## `myCategory.fctrStyles##Fashion` ***
## `myCategory.fctr#Opinion#Room For Debate` ***
## `myCategory.fctr#U.S.#Education`
## `myCategory.fctr##` .
## `myCategory.fctrMetro#N.Y. / Region#` *
## `myCategory.fctrBusiness#Business Day#Small Business` ***
## `myCategory.fctrStyles#U.S.#`
## `myCategory.fctrTravel#Travel#` ***
## `myCategory.fctr#Opinion#The Public Editor`
## H.npnct19.log ***
## H.npnct15.log ***
## .clusterid.fctr101
## .clusterid.fctr102
## .clusterid.fctr103
## .clusterid.fctr104
## .clusterid.fctr401
## .clusterid.fctr402 ***
## .clusterid.fctr403
## .clusterid.fctr404
## .clusterid.fctr405
## .clusterid.fctr406
## .clusterid.fctr407 ***
## .clusterid.fctr408
## .clusterid.fctr409
## .clusterid.fctr410
## .clusterid.fctr411
## .clusterid.fctr412
## .clusterid.fctr413
## .clusterid.fctr414
## .clusterid.fctr415
## .clusterid.fctr501
## .clusterid.fctr502
## .clusterid.fctr503
## .clusterid.fctr504
## .clusterid.fctr505
## .clusterid.fctr506
## .clusterid.fctr507
## .clusterid.fctr508
## .clusterid.fctr509
## .clusterid.fctr510
## .clusterid.fctr511
## .clusterid.fctr512
## .clusterid.fctr513
## .clusterid.fctr701
## .clusterid.fctr702
## .clusterid.fctr703
## .clusterid.fctr704
## .clusterid.fctr705
## .clusterid.fctr706 .
## .clusterid.fctr707
## .clusterid.fctr1101
## .clusterid.fctr1102
## .clusterid.fctr1103
## .clusterid.fctr1104 .
## .clusterid.fctr1105 .
## .clusterid.fctr1106
## .clusterid.fctr1107 *
## .clusterid.fctr1108 .
## .clusterid.fctr1109
## .clusterid.fctr1501
## .clusterid.fctr1502
## .clusterid.fctr1503
## .clusterid.fctr1504
## .clusterid.fctr1505
## .clusterid.fctr1506
## .clusterid.fctr1507
## .clusterid.fctr1508
## .clusterid.fctr1509
## .clusterid.fctr1510
## .clusterid.fctr1511
## .clusterid.fctr1512
## .clusterid.fctr1513
## .clusterid.fctr1514
## .clusterid.fctr1515
## .clusterid.fctr1516
## .clusterid.fctr1517
## .clusterid.fctr1518
## .clusterid.fctr1519
## .clusterid.fctr1520
## .clusterid.fctr1521
## .clusterid.fctr1522
## .clusterid.fctr1523
## .clusterid.fctr1524
## .clusterid.fctr1801 .
## .clusterid.fctr1802
## .clusterid.fctr1803
## .clusterid.fctr1804
## A.npnct13.log **
## A.npnct19.log ***
## S.nuppr.log ***
## S.T.diari
## H.T.word **
## H.npnct08.log *
## H.T.read **
## H.ndgts.log
## S.P.metropolitan.diary.colon
## S.ratio.sum.TfIdf.nwrds **
## A.T.newyork .
## H.nuppr.log .
## S.T.make
## PubDate.wkday.fctr1
## PubDate.wkday.fctr2
## PubDate.wkday.fctr3
## PubDate.wkday.fctr4
## PubDate.wkday.fctr5
## PubDate.wkday.fctr6
## H.nstopwrds.log *
## H.ratio.nstopwrds.nwrds .
## H.npnct11.log *
## S.T.can **
## H.P.no.comment.colon *
## H.P.friday.night.music *
## A.T.newyorktim .
## S.npnct04.log .
## H.T.newyork *
## S.T.share .
## S.npnct08.log
## H.sum.TfIdf .
## H.P.recap.colon
## S.T.one
## H.npnct07.log
## PubDate.last10.log
## H.T.report .
## A.nwrds.unq.log
## A.T.report
## `PubDate.hour.fctr(7.67,15.3]`
## `PubDate.hour.fctr(15.3,23]`
## A.T.articl
## A.sum.TfIdf
## S.nstopwrds.log *
## `PubDate.minute.fctr(14.8,29.5]`
## `PubDate.minute.fctr(29.5,44.2]`
## `PubDate.minute.fctr(44.2,59.1]`
## H.T.polit
## S.ratio.nstopwrds.nwrds *
## A.T.intern
## S.T.time
## H.npnct12.log
## S.T.take
## H.T.art
## H.npnct13.log
## `PubDate.second.fctr(14.8,29.5]`
## `PubDate.second.fctr(29.5,44.2]`
## `PubDate.second.fctr(44.2,59.1]`
## H.T.week
## H.T.get .
## S.npnct01.log .
## A.T.will
## S.T.show
## H.T.new
## .rnorm
## H.ratio.sum.TfIdf.nwrds
## S.ndgts.log
## H.T.say
## A.T.first
## A.T.photo
## H.T.china
## H.npnct01.log
## H.T.make
## A.T.senat
## S.T.said *
## S.T.day
## H.npnct28.log
## H.T.news
## H.npnct16.log
## H.T.take
## S.npnct12.log
## H.T.busi
## A.T.compani
## S.npnct11.log
## H.T.day
## A.T.word
## H.P.facts.figures *
## H.T.X2014
## PubDate.last1.log
## S.T.obama
## `PubDate.date.fctr(7,13]`
## `PubDate.date.fctr(13,19]`
## `PubDate.date.fctr(19,25]`
## `PubDate.date.fctr(25,31]`
## H.T.big
## S.npnct14.log
## A.npnct16.log
## S.npnct06.log .
## S.T.appear
## PubDate.last100.log
## PubDate.wkend
## H.T.ebola
## H.nwrds.log
## H.T.obama
## A.T.year
## A.nchrs.log
## H.T.test
## A.T.week
## H.T.pictur
## S.nwrds.log
## H.T.newyorktim
## S.npnct15.log
## H.T.bank
## H.T.billion
## S.T.new
## A.T.fashion
## H.P.fashion.week
## S.T.archiv
## S.T.herald
## H.T.springsumm
## S.T.tribun
## H.T.deal
## H.P.first.draft
## S.npnct28.log
## H.P.daily.clip.report
## H.P.today.in.smallbusiness
## H.P.verbatim.colon
## S.P.first.draft
## H.npnct02.log
## H.P.quandary
## S.npnct20.log
## S.npnct03.log
## A.npnct18.log
## A.T.presid
## S.T.presid
## S.P.year.colon
## H.P.on.this.day
## H.npnct05.log
## S.npnct07.log
## S.P.fashion.week
## H.P.s.notebook .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5900.1 on 6531 degrees of freedom
## Residual deviance: 2476.8 on 6298 degrees of freedom
## AIC: 2944.8
##
## Number of Fisher Scoring iterations: 25
##
## [1] " calling mypredict_mdl for fit:"
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## threshold f.score
## 1 0.0 0.2866885
## 2 0.1 0.6780216
## 3 0.2 0.7429889
## 4 0.3 0.7626392
## 5 0.4 0.7727691
## 6 0.5 0.7635830
## 7 0.6 0.7341641
## 8 0.7 0.6818434
## 9 0.8 0.5981763
## 10 0.9 0.4502762
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.4000 to maximize f.score.fit"
## Popular.fctr Popular.fctr.predict.Final.glm.N
## 1 N 5198
## 2 Y 253
## Popular.fctr.predict.Final.glm.Y
## 1 241
## 2 840
## Prediction
## Reference N Y
## N 5198 241
## Y 253 840
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.243723e-01 7.274080e-01 9.176925e-01 9.306703e-01 8.326699e-01
## AccuracyPValue McnemarPValue
## 8.059885e-106 6.206613e-01
## Warning in mypredict_mdl(mdl, df = fit_df, rsp_var, rsp_var_out,
## model_id_method, : Expecting 1 metric: Accuracy; recd: Accuracy, Kappa;
## retaining Accuracy only
## model_id model_method
## 1 Final.glm glm
## feats
## 1 WordCount.log, H.P.readers.respond, myCategory.fctr, H.npnct19.log, H.npnct15.log, .clusterid.fctr, A.npnct13.log, A.npnct19.log, S.nuppr.log, S.T.diari, H.T.word, H.npnct08.log, H.T.read, H.ndgts.log, S.P.metropolitan.diary.colon, S.ratio.sum.TfIdf.nwrds, A.T.newyork, H.nuppr.log, S.T.make, PubDate.wkday.fctr, H.nstopwrds.log, H.ratio.nstopwrds.nwrds, H.npnct11.log, S.T.can, H.P.no.comment.colon, H.P.friday.night.music, A.T.newyorktim, S.npnct04.log, H.T.newyork, S.T.share, S.npnct08.log, H.sum.TfIdf, H.P.recap.colon, S.T.one, H.npnct07.log, PubDate.last10.log, H.T.report, A.nwrds.unq.log, A.T.report, PubDate.hour.fctr, A.T.articl, A.sum.TfIdf, S.nstopwrds.log, PubDate.minute.fctr, H.T.polit, S.ratio.nstopwrds.nwrds, A.T.intern, S.T.time, H.npnct12.log, S.T.take, H.T.art, H.npnct13.log, PubDate.second.fctr, H.T.week, H.T.get, S.npnct01.log, A.T.will, S.T.show, H.T.new, .rnorm, H.ratio.sum.TfIdf.nwrds, S.ndgts.log, H.T.say, A.T.first, A.T.photo, H.T.china, H.npnct01.log, H.T.make, A.T.senat, S.T.said, S.T.day, H.npnct28.log, H.T.news, H.npnct16.log, H.T.take, S.npnct12.log, H.T.busi, A.T.compani, S.npnct11.log, H.T.day, A.T.word, H.P.facts.figures, H.T.X2014, PubDate.last1.log, S.T.obama, PubDate.date.fctr, H.T.big, S.npnct14.log, A.npnct16.log, S.npnct06.log, S.T.appear, PubDate.last100.log, PubDate.wkend, H.T.ebola, H.nwrds.log, H.T.obama, A.T.year, A.nchrs.log, H.T.test, A.T.week, H.T.pictur, S.nwrds.log, H.T.newyorktim, S.npnct15.log, H.T.bank, H.T.billion, S.T.new, A.T.fashion, H.P.fashion.week, S.T.archiv, S.T.herald, H.T.springsumm, S.T.tribun, H.T.deal, H.P.first.draft, S.npnct28.log, H.P.daily.clip.report, H.P.today.in.smallbusiness, H.P.verbatim.colon, S.P.first.draft, H.npnct02.log, H.P.quandary, S.npnct20.log, S.npnct03.log, A.npnct18.log, A.T.presid, S.T.presid, S.P.year.colon, H.P.on.this.day, H.npnct05.log, S.npnct07.log, S.P.fashion.week, H.P.s.notebook
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 1 35.58 19.035
## max.auc.fit opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.9584716 0.4 0.7727691 0.8770685
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit min.aic.fit
## 1 0.9176925 0.9306703 0.5573295 2944.841
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.01392461 0.046206
# Remove `ret_lst` from the workspace, then log the next "fit.data.training"
# step in the chunk tracker without starting a new major step (the logged
# output shows the minor step advancing under the same major step).
rm(list = "ret_lst")
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.data.training",
                             major.inc = FALSE)
## label step_major step_minor bgn end elapsed
## 14 fit.data.training 8 0 514.310 558.729 44.419
## 15 fit.data.training 8 1 558.729 NA NA
# Add the final model's predictions to the training observations (the helper
# and the columns it appends are defined outside this chunk).
# For binomial classification the default probability threshold passed in is
# the `opt.prob.threshold.OOB` value stored for the selected model; for any
# other problem type NULL is passed.
# A scalar if/else is used instead of ifelse(): ifelse(FALSE, x, NULL) stops
# with "replacement has length zero", so the NULL branch could never be taken.
# `[1]` keeps the threshold a single value, as ifelse() on a scalar test did.
glb_trnobs_df <- glb_get_predictions(
    df = glb_trnobs_df, mdl_id = glb_fin_mdl_id,
    rsp_var_out = glb_rsp_var_out,
    prob_threshold_def = if (glb_is_classification && glb_is_binomial) {
        glb_models_df[glb_models_df$model_id == glb_sel_mdl_id,
                      "opt.prob.threshold.OOB"][1]
    } else {
        NULL
    })
## Warning in glb_get_predictions(df = glb_trnobs_df, mdl_id =
## glb_fin_mdl_id, : Using default probability threshold: 0.3
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
# Merge the final model's feature importances into the feature summary table,
# keep a copy of the `importance` column under "<final model id>.importance",
# and print the resulting table.
glb_feats_df <- mymerge_feats_importance(
    feats_df = glb_feats_df,
    sel_mdl = glb_fin_mdl,
    entity_df = glb_trnobs_df
)
glb_feats_df[[paste0(glb_fin_mdl_id, ".importance")]] <- glb_feats_df$importance
print(glb_feats_df)
## id importance
## WordCount.log WordCount.log 1.000000e+02
## H.P.readers.respond H.P.readers.respond 5.146747e+01
## myCategory.fctr myCategory.fctr 4.281250e+01
## H.npnct19.log H.npnct19.log 4.004695e+01
## H.npnct15.log H.npnct15.log 3.467378e+01
## .clusterid.fctr .clusterid.fctr 2.932308e+01
## A.npnct13.log A.npnct13.log 2.871074e+01
## A.npnct19.log A.npnct19.log 2.850228e+01
## S.nuppr.log S.nuppr.log 2.501950e+01
## S.T.diari S.T.diari 2.124067e+01
## H.T.word H.T.word 2.105883e+01
## H.npnct08.log H.npnct08.log 2.045852e+01
## H.T.read H.T.read 2.019170e+01
## H.ndgts.log H.ndgts.log 1.951144e+01
## S.P.metropolitan.diary.colon S.P.metropolitan.diary.colon 1.888990e+01
## S.ratio.sum.TfIdf.nwrds S.ratio.sum.TfIdf.nwrds 1.854118e+01
## A.T.newyork A.T.newyork 1.825819e+01
## H.nuppr.log H.nuppr.log 1.817153e+01
## S.T.make S.T.make 1.723536e+01
## PubDate.wkday.fctr PubDate.wkday.fctr 1.661497e+01
## H.nstopwrds.log H.nstopwrds.log 1.627434e+01
## H.ratio.nstopwrds.nwrds H.ratio.nstopwrds.nwrds 1.623508e+01
## H.npnct11.log H.npnct11.log 1.582943e+01
## S.T.can S.T.can 1.557214e+01
## H.P.no.comment.colon H.P.no.comment.colon 1.552028e+01
## H.P.friday.night.music H.P.friday.night.music 1.488096e+01
## A.T.newyorktim A.T.newyorktim 1.471359e+01
## S.npnct04.log S.npnct04.log 1.357564e+01
## H.T.newyork H.T.newyork 1.337672e+01
## S.T.share S.T.share 1.322366e+01
## S.npnct08.log S.npnct08.log 1.319408e+01
## H.sum.TfIdf H.sum.TfIdf 1.282767e+01
## H.P.recap.colon H.P.recap.colon 1.273820e+01
## S.T.one S.T.one 1.238107e+01
## H.npnct07.log H.npnct07.log 1.222449e+01
## PubDate.last10.log PubDate.last10.log 1.193952e+01
## H.T.report H.T.report 1.191324e+01
## A.nwrds.unq.log A.nwrds.unq.log 1.190743e+01
## A.T.report A.T.report 1.155728e+01
## PubDate.hour.fctr PubDate.hour.fctr 1.144174e+01
## A.T.articl A.T.articl 1.128851e+01
## A.sum.TfIdf A.sum.TfIdf 1.110991e+01
## S.nstopwrds.log S.nstopwrds.log 1.099737e+01
## PubDate.minute.fctr PubDate.minute.fctr 1.055593e+01
## H.T.polit H.T.polit 1.032948e+01
## S.ratio.nstopwrds.nwrds S.ratio.nstopwrds.nwrds 1.026798e+01
## A.T.intern A.T.intern 9.914726e+00
## S.T.time S.T.time 9.904830e+00
## H.npnct12.log H.npnct12.log 9.888614e+00
## S.T.take S.T.take 9.852005e+00
## H.T.art H.T.art 9.842033e+00
## H.npnct13.log H.npnct13.log 9.829552e+00
## PubDate.second.fctr PubDate.second.fctr 9.794379e+00
## H.T.week H.T.week 9.718351e+00
## H.T.get H.T.get 9.576775e+00
## S.npnct01.log S.npnct01.log 9.564919e+00
## A.T.will A.T.will 9.491070e+00
## S.T.show S.T.show 9.342530e+00
## H.T.new H.T.new 9.203584e+00
## .rnorm .rnorm 8.941026e+00
## H.ratio.sum.TfIdf.nwrds H.ratio.sum.TfIdf.nwrds 8.939183e+00
## S.ndgts.log S.ndgts.log 8.755923e+00
## H.T.say H.T.say 8.671813e+00
## A.T.first A.T.first 8.505413e+00
## A.T.photo A.T.photo 8.036467e+00
## H.T.china H.T.china 7.799595e+00
## H.npnct01.log H.npnct01.log 7.787548e+00
## H.T.make H.T.make 7.674641e+00
## A.T.senat A.T.senat 7.437682e+00
## S.T.said S.T.said 7.364645e+00
## S.T.day S.T.day 7.260435e+00
## H.npnct28.log H.npnct28.log 7.016220e+00
## H.T.news H.T.news 6.901869e+00
## H.npnct16.log H.npnct16.log 6.896467e+00
## H.T.take H.T.take 6.881392e+00
## S.npnct12.log S.npnct12.log 6.296543e+00
## H.T.busi H.T.busi 5.892173e+00
## A.T.compani A.T.compani 5.701348e+00
## S.npnct11.log S.npnct11.log 5.458567e+00
## H.T.day H.T.day 5.341694e+00
## A.T.word A.T.word 5.341188e+00
## H.P.facts.figures H.P.facts.figures 5.332319e+00
## H.T.X2014 H.T.X2014 5.105515e+00
## PubDate.last1.log PubDate.last1.log 5.029197e+00
## S.T.obama S.T.obama 4.950818e+00
## PubDate.date.fctr PubDate.date.fctr 4.692266e+00
## H.T.big H.T.big 4.491165e+00
## S.npnct14.log S.npnct14.log 4.449833e+00
## A.npnct16.log A.npnct16.log 4.218694e+00
## S.npnct06.log S.npnct06.log 3.899006e+00
## S.T.appear S.T.appear 3.840502e+00
## PubDate.last100.log PubDate.last100.log 3.387278e+00
## PubDate.wkend PubDate.wkend 3.323031e+00
## H.T.ebola H.T.ebola 3.198159e+00
## H.nwrds.log H.nwrds.log 3.088101e+00
## H.T.obama H.T.obama 2.703165e+00
## A.T.year A.T.year 2.411870e+00
## A.nchrs.log A.nchrs.log 2.210854e+00
## H.T.test H.T.test 1.856699e+00
## A.T.week A.T.week 1.735453e+00
## H.T.pictur H.T.pictur 1.728117e+00
## S.nwrds.log S.nwrds.log 1.511344e+00
## H.T.newyorktim H.T.newyorktim 1.280498e+00
## S.npnct15.log S.npnct15.log 1.190329e+00
## H.T.bank H.T.bank 1.038507e+00
## H.T.billion H.T.billion 7.999936e-01
## S.T.new S.T.new 1.565429e-01
## A.T.fashion A.T.fashion 1.528913e-01
## H.P.fashion.week H.P.fashion.week 1.109378e-01
## S.T.archiv S.T.archiv 9.386317e-02
## S.T.herald S.T.herald 8.612681e-02
## H.T.springsumm H.T.springsumm 7.774263e-02
## S.T.tribun S.T.tribun 7.345077e-02
## H.T.deal H.T.deal 6.047754e-02
## H.P.first.draft H.P.first.draft 4.572882e-02
## S.npnct28.log S.npnct28.log 4.463511e-02
## H.P.daily.clip.report H.P.daily.clip.report 3.416759e-02
## H.P.today.in.smallbusiness H.P.today.in.smallbusiness 2.839539e-02
## H.P.verbatim.colon H.P.verbatim.colon 1.634958e-02
## S.P.first.draft S.P.first.draft 1.547771e-02
## H.npnct02.log H.npnct02.log 1.526519e-02
## H.P.quandary H.P.quandary 1.236942e-02
## S.npnct20.log S.npnct20.log 1.220902e-02
## S.npnct03.log S.npnct03.log 1.207385e-02
## A.npnct18.log A.npnct18.log 1.043696e-02
## A.T.presid A.T.presid 9.440632e-03
## S.T.presid S.T.presid 9.421346e-03
## S.P.year.colon S.P.year.colon 6.252341e-03
## H.P.on.this.day H.P.on.this.day 5.059061e-03
## H.npnct05.log H.npnct05.log 4.614811e-03
## S.npnct07.log S.npnct07.log 2.450155e-03
## S.P.fashion.week S.P.fashion.week 8.838548e-04
## H.P.s.notebook H.P.s.notebook 0.000000e+00
## .clusterid .clusterid NA
## A.ndgts.log A.ndgts.log NA
## A.npnct01.log A.npnct01.log NA
## A.npnct02.log A.npnct02.log NA
## A.npnct03.log A.npnct03.log NA
## A.npnct04.log A.npnct04.log NA
## A.npnct05.log A.npnct05.log NA
## A.npnct06.log A.npnct06.log NA
## A.npnct07.log A.npnct07.log NA
## A.npnct08.log A.npnct08.log NA
## A.npnct09.log A.npnct09.log NA
## A.npnct10.log A.npnct10.log NA
## A.npnct11.log A.npnct11.log NA
## A.npnct12.log A.npnct12.log NA
## A.npnct14.log A.npnct14.log NA
## A.npnct15.log A.npnct15.log NA
## A.npnct17.log A.npnct17.log NA
## A.npnct20.log A.npnct20.log NA
## A.npnct21.log A.npnct21.log NA
## A.npnct22.log A.npnct22.log NA
## A.npnct23.log A.npnct23.log NA
## A.npnct24.log A.npnct24.log NA
## A.npnct25.log A.npnct25.log NA
## A.npnct26.log A.npnct26.log NA
## A.npnct27.log A.npnct27.log NA
## A.npnct28.log A.npnct28.log NA
## A.npnct29.log A.npnct29.log NA
## A.npnct30.log A.npnct30.log NA
## A.nstopwrds.log A.nstopwrds.log NA
## A.nuppr.log A.nuppr.log NA
## A.nwrds.log A.nwrds.log NA
## A.P.daily.clip.report A.P.daily.clip.report NA
## A.P.fashion.week A.P.fashion.week NA
## A.P.first.draft A.P.first.draft NA
## A.P.http A.P.http NA
## A.P.metropolitan.diary.colon A.P.metropolitan.diary.colon NA
## A.P.year.colon A.P.year.colon NA
## A.ratio.nstopwrds.nwrds A.ratio.nstopwrds.nwrds NA
## A.ratio.sum.TfIdf.nwrds A.ratio.sum.TfIdf.nwrds NA
## A.T.appear A.T.appear NA
## A.T.archiv A.T.archiv NA
## A.T.can A.T.can NA
## A.T.day A.T.day NA
## A.T.diari A.T.diari NA
## A.T.herald A.T.herald NA
## A.T.make A.T.make NA
## A.T.new A.T.new NA
## A.T.obama A.T.obama NA
## A.T.one A.T.one NA
## A.T.said A.T.said NA
## A.T.share A.T.share NA
## A.T.show A.T.show NA
## A.T.take A.T.take NA
## A.T.time A.T.time NA
## A.T.tribun A.T.tribun NA
## H.nchrs.log H.nchrs.log NA
## H.npnct03.log H.npnct03.log NA
## H.npnct04.log H.npnct04.log NA
## H.npnct06.log H.npnct06.log NA
## H.npnct09.log H.npnct09.log NA
## H.npnct10.log H.npnct10.log NA
## H.npnct14.log H.npnct14.log NA
## H.npnct17.log H.npnct17.log NA
## H.npnct18.log H.npnct18.log NA
## H.npnct20.log H.npnct20.log NA
## H.npnct21.log H.npnct21.log NA
## H.npnct22.log H.npnct22.log NA
## H.npnct23.log H.npnct23.log NA
## H.npnct24.log H.npnct24.log NA
## H.npnct25.log H.npnct25.log NA
## H.npnct26.log H.npnct26.log NA
## H.npnct27.log H.npnct27.log NA
## H.npnct29.log H.npnct29.log NA
## H.npnct30.log H.npnct30.log NA
## H.nwrds.unq.log H.nwrds.unq.log NA
## H.P.http H.P.http NA
## H.P.today.in.politic H.P.today.in.politic NA
## H.P.what.we.are H.P.what.we.are NA
## H.P.year.colon H.P.year.colon NA
## H.T.clip H.T.clip NA
## H.T.daili H.T.daili NA
## H.T.fashion H.T.fashion NA
## H.T.first H.T.first NA
## H.T.morn H.T.morn NA
## H.T.today H.T.today NA
## H.T.X2015 H.T.X2015 NA
## Popular Popular NA
## Popular.fctr Popular.fctr NA
## PubDate.last1 PubDate.last1 NA
## PubDate.last10 PubDate.last10 NA
## PubDate.last100 PubDate.last100 NA
## PubDate.month.fctr PubDate.month.fctr NA
## PubDate.POSIX PubDate.POSIX NA
## PubDate.year.fctr PubDate.year.fctr NA
## PubDate.zoo PubDate.zoo NA
## S.nchrs.log S.nchrs.log NA
## S.npnct02.log S.npnct02.log NA
## S.npnct05.log S.npnct05.log NA
## S.npnct09.log S.npnct09.log NA
## S.npnct10.log S.npnct10.log NA
## S.npnct13.log S.npnct13.log NA
## S.npnct16.log S.npnct16.log NA
## S.npnct17.log S.npnct17.log NA
## S.npnct18.log S.npnct18.log NA
## S.npnct19.log S.npnct19.log NA
## S.npnct21.log S.npnct21.log NA
## S.npnct22.log S.npnct22.log NA
## S.npnct23.log S.npnct23.log NA
## S.npnct24.log S.npnct24.log NA
## S.npnct25.log S.npnct25.log NA
## S.npnct26.log S.npnct26.log NA
## S.npnct27.log S.npnct27.log NA
## S.npnct29.log S.npnct29.log NA
## S.npnct30.log S.npnct30.log NA
## S.nwrds.unq.log S.nwrds.unq.log NA
## S.P.daily.clip.report S.P.daily.clip.report NA
## S.P.http S.P.http NA
## S.sum.TfIdf S.sum.TfIdf NA
## S.T.articl S.T.articl NA
## S.T.compani S.T.compani NA
## S.T.fashion S.T.fashion NA
## S.T.first S.T.first NA
## S.T.intern S.T.intern NA
## S.T.newyork S.T.newyork NA
## S.T.newyorktim S.T.newyorktim NA
## S.T.photo S.T.photo NA
## S.T.report S.T.report NA
## S.T.senat S.T.senat NA
## S.T.week S.T.week NA
## S.T.will S.T.will NA
## S.T.word S.T.word NA
## S.T.year S.T.year NA
## UniqueID UniqueID NA
## WordCount WordCount NA
## cor.y exclude.as.feat cor.y.abs
## WordCount.log 2.656836e-01 FALSE 2.656836e-01
## H.P.readers.respond 4.432886e-02 FALSE 4.432886e-02
## myCategory.fctr 1.234541e-02 FALSE 1.234541e-02
## H.npnct19.log 1.283641e-01 FALSE 1.283641e-01
## H.npnct15.log -8.273237e-02 FALSE 8.273237e-02
## .clusterid.fctr 1.813987e-01 FALSE 1.813987e-01
## A.npnct13.log -4.999563e-02 FALSE 4.999563e-02
## A.npnct19.log 5.482747e-02 FALSE 5.482747e-02
## S.nuppr.log -2.718459e-01 FALSE 2.718459e-01
## S.T.diari -6.229931e-02 FALSE 6.229931e-02
## H.T.word -1.382927e-02 FALSE 1.382927e-02
## H.npnct08.log 5.375262e-02 FALSE 5.375262e-02
## H.T.read -3.467043e-02 FALSE 3.467043e-02
## H.ndgts.log -1.196633e-01 FALSE 1.196633e-01
## S.P.metropolitan.diary.colon -2.841404e-02 FALSE 2.841404e-02
## S.ratio.sum.TfIdf.nwrds 2.622549e-01 FALSE 2.622549e-01
## A.T.newyork -4.686921e-02 FALSE 4.686921e-02
## H.nuppr.log -1.278085e-01 FALSE 1.278085e-01
## S.T.make 4.118050e-02 FALSE 4.118050e-02
## PubDate.wkday.fctr -3.980129e-02 FALSE 3.980129e-02
## H.nstopwrds.log -8.657067e-02 FALSE 8.657067e-02
## H.ratio.nstopwrds.nwrds 4.024406e-02 FALSE 4.024406e-02
## H.npnct11.log 1.333613e-02 FALSE 1.333613e-02
## S.T.can 3.005998e-02 FALSE 3.005998e-02
## H.P.no.comment.colon 6.074669e-02 FALSE 6.074669e-02
## H.P.friday.night.music -9.653967e-03 FALSE 9.653967e-03
## A.T.newyorktim -4.984782e-02 FALSE 4.984782e-02
## S.npnct04.log -6.294642e-02 FALSE 6.294642e-02
## H.T.newyork -5.564999e-02 FALSE 5.564999e-02
## S.T.share -5.105597e-02 FALSE 5.105597e-02
## S.npnct08.log -3.372706e-03 FALSE 3.372706e-03
## H.sum.TfIdf 1.520414e-01 FALSE 1.520414e-01
## H.P.recap.colon 9.008096e-02 FALSE 9.008096e-02
## S.T.one 1.050293e-02 FALSE 1.050293e-02
## H.npnct07.log -1.201741e-02 FALSE 1.201741e-02
## PubDate.last10.log 4.931702e-02 FALSE 4.931702e-02
## H.T.report -6.238114e-02 FALSE 6.238114e-02
## A.nwrds.unq.log -2.460117e-01 FALSE 2.460117e-01
## A.T.report -4.774593e-02 FALSE 4.774593e-02
## PubDate.hour.fctr 1.354368e-01 FALSE 1.354368e-01
## A.T.articl -5.470831e-02 FALSE 5.470831e-02
## A.sum.TfIdf 1.478461e-01 FALSE 1.478461e-01
## S.nstopwrds.log -1.148150e-01 FALSE 1.148150e-01
## PubDate.minute.fctr -3.407385e-02 FALSE 3.407385e-02
## H.T.polit -3.058564e-02 FALSE 3.058564e-02
## S.ratio.nstopwrds.nwrds 1.206896e-01 FALSE 1.206896e-01
## A.T.intern -6.953025e-02 FALSE 6.953025e-02
## S.T.time -2.416246e-02 FALSE 2.416246e-02
## H.npnct12.log -1.305305e-02 FALSE 1.305305e-02
## S.T.take -2.264447e-02 FALSE 2.264447e-02
## H.T.art -3.291486e-02 FALSE 3.291486e-02
## H.npnct13.log -2.524770e-02 FALSE 2.524770e-02
## PubDate.second.fctr -1.187946e-02 FALSE 1.187946e-02
## H.T.week -6.827601e-02 FALSE 6.827601e-02
## H.T.get 3.300192e-02 FALSE 3.300192e-02
## S.npnct01.log 3.093101e-02 FALSE 3.093101e-02
## A.T.will -3.884318e-02 FALSE 3.884318e-02
## S.T.show -4.182920e-02 FALSE 4.182920e-02
## H.T.new -4.111696e-02 FALSE 4.111696e-02
## .rnorm -8.244230e-03 FALSE 8.244230e-03
## H.ratio.sum.TfIdf.nwrds 2.254527e-01 FALSE 2.254527e-01
## S.ndgts.log -1.242046e-01 FALSE 1.242046e-01
## H.T.say -9.763205e-03 FALSE 9.763205e-03
## A.T.first -4.433630e-02 FALSE 4.433630e-02
## A.T.photo -6.873838e-02 FALSE 6.873838e-02
## H.T.china -3.283653e-02 FALSE 3.283653e-02
## H.npnct01.log 2.271577e-02 FALSE 2.271577e-02
## H.T.make 1.349595e-02 FALSE 1.349595e-02
## A.T.senat -4.139980e-02 FALSE 4.139980e-02
## S.T.said 1.863436e-02 FALSE 1.863436e-02
## S.T.day -4.262213e-02 FALSE 4.262213e-02
## H.npnct28.log -8.917338e-02 FALSE 8.917338e-02
## H.T.news -4.436368e-02 FALSE 4.436368e-02
## H.npnct16.log 3.039622e-02 FALSE 3.039622e-02
## H.T.take -1.263270e-03 FALSE 1.263270e-03
## S.npnct12.log -3.638891e-02 FALSE 3.638891e-02
## H.T.busi -4.899819e-02 FALSE 4.899819e-02
## A.T.compani -4.774812e-02 FALSE 4.774812e-02
## S.npnct11.log -9.158156e-02 FALSE 9.158156e-02
## H.T.day -6.044381e-02 FALSE 6.044381e-02
## A.T.word -4.821561e-02 FALSE 4.821561e-02
## H.P.facts.figures 5.410097e-02 FALSE 5.410097e-02
## H.T.X2014 -4.523858e-02 FALSE 4.523858e-02
## PubDate.last1.log 4.635751e-02 FALSE 4.635751e-02
## S.T.obama -1.914281e-02 FALSE 1.914281e-02
## PubDate.date.fctr -1.164756e-02 FALSE 1.164756e-02
## H.T.big -1.438162e-02 FALSE 1.438162e-02
## S.npnct14.log -2.121844e-02 FALSE 2.121844e-02
## A.npnct16.log -1.587454e-03 FALSE 1.587454e-03
## S.npnct06.log -2.389145e-02 FALSE 2.389145e-02
## S.T.appear -3.941362e-02 FALSE 3.941362e-02
## PubDate.last100.log -7.663322e-03 FALSE 7.663322e-03
## PubDate.wkend 1.067288e-01 FALSE 1.067288e-01
## H.T.ebola 2.682920e-02 FALSE 2.682920e-02
## H.nwrds.log -1.573431e-01 FALSE 1.573431e-01
## H.T.obama -9.878461e-03 FALSE 9.878461e-03
## A.T.year -3.741571e-02 FALSE 3.741571e-02
## A.nchrs.log -2.245488e-01 FALSE 2.245488e-01
## H.T.test -2.117852e-02 FALSE 2.117852e-02
## A.T.week -8.542792e-02 FALSE 8.542792e-02
## H.T.pictur -4.003882e-02 FALSE 4.003882e-02
## S.nwrds.log -1.978341e-01 FALSE 1.978341e-01
## H.T.newyorktim -2.514415e-02 FALSE 2.514415e-02
## S.npnct15.log -6.770952e-02 FALSE 6.770952e-02
## H.T.bank -1.037439e-02 FALSE 1.037439e-02
## H.T.billion -2.776561e-02 FALSE 2.776561e-02
## S.T.new -2.592872e-02 FALSE 2.592872e-02
## A.T.fashion -8.416793e-02 FALSE 8.416793e-02
## H.P.fashion.week -7.632046e-02 FALSE 7.632046e-02
## S.T.archiv -7.202808e-02 FALSE 7.202808e-02
## S.T.herald -6.752419e-02 FALSE 6.752419e-02
## H.T.springsumm -5.943248e-02 FALSE 5.943248e-02
## S.T.tribun -7.013418e-02 FALSE 7.013418e-02
## H.T.deal -2.556237e-02 FALSE 2.556237e-02
## H.P.first.draft -4.316253e-02 FALSE 4.316253e-02
## S.npnct28.log -4.370037e-02 FALSE 4.370037e-02
## H.P.daily.clip.report -4.388279e-02 FALSE 4.388279e-02
## H.P.today.in.smallbusiness -4.243051e-02 FALSE 4.243051e-02
## H.P.verbatim.colon -3.194363e-02 FALSE 3.194363e-02
## S.P.first.draft -2.150663e-02 FALSE 2.150663e-02
## H.npnct02.log -2.001851e-02 FALSE 2.001851e-02
## H.P.quandary 8.734922e-02 FALSE 8.734922e-02
## S.npnct20.log -1.923169e-02 FALSE 1.923169e-02
## S.npnct03.log -1.240734e-02 FALSE 1.240734e-02
## A.npnct18.log -1.271661e-02 FALSE 1.271661e-02
## A.T.presid -2.090565e-03 FALSE 2.090565e-03
## S.T.presid -2.381159e-03 FALSE 2.381159e-03
## S.P.year.colon -1.755336e-02 FALSE 1.755336e-02
## H.P.on.this.day -2.150663e-02 FALSE 2.150663e-02
## H.npnct05.log -9.653967e-03 FALSE 9.653967e-03
## S.npnct07.log -1.214357e-02 FALSE 1.214357e-02
## S.P.fashion.week -7.080716e-02 FALSE 7.080716e-02
## H.P.s.notebook 7.755542e-03 FALSE 7.755542e-03
## .clusterid 1.820567e-01 TRUE 1.820567e-01
## A.ndgts.log -1.249484e-01 FALSE 1.249484e-01
## A.npnct01.log 3.093101e-02 FALSE 3.093101e-02
## A.npnct02.log -1.451467e-02 FALSE 1.451467e-02
## A.npnct03.log -1.359260e-02 FALSE 1.359260e-02
## A.npnct04.log -6.294642e-02 FALSE 6.294642e-02
## A.npnct05.log NA FALSE NA
## A.npnct06.log -2.389145e-02 FALSE 2.389145e-02
## A.npnct07.log -1.214357e-02 FALSE 1.214357e-02
## A.npnct08.log -4.193476e-03 FALSE 4.193476e-03
## A.npnct09.log NA FALSE NA
## A.npnct10.log -5.547032e-03 FALSE 5.547032e-03
## A.npnct11.log -9.183870e-02 FALSE 9.183870e-02
## A.npnct12.log -3.760012e-02 FALSE 3.760012e-02
## A.npnct14.log -2.407715e-02 FALSE 2.407715e-02
## A.npnct15.log -6.893301e-02 FALSE 6.893301e-02
## A.npnct17.log -1.457558e-02 FALSE 1.457558e-02
## A.npnct20.log -1.923169e-02 FALSE 1.923169e-02
## A.npnct21.log 1.537569e-02 FALSE 1.537569e-02
## A.npnct22.log NA FALSE NA
## A.npnct23.log 1.537569e-02 FALSE 1.537569e-02
## A.npnct24.log -9.890046e-19 FALSE 9.890046e-19
## A.npnct25.log -5.547032e-03 FALSE 5.547032e-03
## A.npnct26.log NA FALSE NA
## A.npnct27.log NA FALSE NA
## A.npnct28.log -4.373349e-02 FALSE 4.373349e-02
## A.npnct29.log NA FALSE NA
## A.npnct30.log NA FALSE NA
## A.nstopwrds.log -1.153879e-01 FALSE 1.153879e-01
## A.nuppr.log -2.720962e-01 FALSE 2.720962e-01
## A.nwrds.log -1.978712e-01 FALSE 1.978712e-01
## A.P.daily.clip.report -4.388279e-02 FALSE 4.388279e-02
## A.P.fashion.week -7.080716e-02 FALSE 7.080716e-02
## A.P.first.draft -2.150663e-02 FALSE 2.150663e-02
## A.P.http -1.294748e-02 FALSE 1.294748e-02
## A.P.metropolitan.diary.colon -2.841404e-02 FALSE 2.841404e-02
## A.P.year.colon -1.755336e-02 FALSE 1.755336e-02
## A.ratio.nstopwrds.nwrds 1.213545e-01 FALSE 1.213545e-01
## A.ratio.sum.TfIdf.nwrds 2.623865e-01 FALSE 2.623865e-01
## A.T.appear -3.941362e-02 FALSE 3.941362e-02
## A.T.archiv -7.202808e-02 FALSE 7.202808e-02
## A.T.can 3.083389e-02 FALSE 3.083389e-02
## A.T.day -4.270831e-02 FALSE 4.270831e-02
## A.T.diari -6.229931e-02 FALSE 6.229931e-02
## A.T.herald -6.752419e-02 FALSE 6.752419e-02
## A.T.make 4.124187e-02 FALSE 4.124187e-02
## A.T.new -2.597887e-02 FALSE 2.597887e-02
## A.T.obama -1.914924e-02 FALSE 1.914924e-02
## A.T.one 1.051414e-02 FALSE 1.051414e-02
## A.T.said 1.876762e-02 FALSE 1.876762e-02
## A.T.share -5.105597e-02 FALSE 5.105597e-02
## A.T.show -4.185292e-02 FALSE 4.185292e-02
## A.T.take -2.271897e-02 FALSE 2.271897e-02
## A.T.time -2.430509e-02 FALSE 2.430509e-02
## A.T.tribun -7.013418e-02 FALSE 7.013418e-02
## H.nchrs.log -1.710624e-01 FALSE 1.710624e-01
## H.npnct03.log 9.533020e-03 FALSE 9.533020e-03
## H.npnct04.log -5.126277e-02 FALSE 5.126277e-02
## H.npnct06.log 3.190718e-02 FALSE 3.190718e-02
## H.npnct09.log NA FALSE NA
## H.npnct10.log -5.547032e-03 FALSE 5.547032e-03
## H.npnct14.log -6.158577e-02 FALSE 6.158577e-02
## H.npnct17.log NA FALSE NA
## H.npnct18.log NA FALSE NA
## H.npnct20.log -5.547032e-03 FALSE 5.547032e-03
## H.npnct21.log NA FALSE NA
## H.npnct22.log NA FALSE NA
## H.npnct23.log NA FALSE NA
## H.npnct24.log -9.890046e-19 FALSE 9.890046e-19
## H.npnct25.log NA FALSE NA
## H.npnct26.log NA FALSE NA
## H.npnct27.log NA FALSE NA
## H.npnct29.log NA FALSE NA
## H.npnct30.log NA FALSE NA
## H.nwrds.unq.log -2.014127e-01 FALSE 2.014127e-01
## H.P.http NA FALSE NA
## H.P.today.in.politic -3.733661e-02 FALSE 3.733661e-02
## H.P.what.we.are -3.775209e-02 FALSE 3.775209e-02
## H.P.year.colon -7.842875e-02 FALSE 7.842875e-02
## H.T.clip -4.388279e-02 FALSE 4.388279e-02
## H.T.daili -6.303731e-02 FALSE 6.303731e-02
## H.T.fashion -7.947505e-02 FALSE 7.947505e-02
## H.T.first -4.472902e-02 FALSE 4.472902e-02
## H.T.morn -4.838380e-02 FALSE 4.838380e-02
## H.T.today -5.833786e-02 FALSE 5.833786e-02
## H.T.X2015 -6.601141e-02 FALSE 6.601141e-02
## Popular 1.000000e+00 TRUE 1.000000e+00
## Popular.fctr NA TRUE NA
## PubDate.last1 3.592267e-02 TRUE 3.592267e-02
## PubDate.last10 5.398093e-02 TRUE 5.398093e-02
## PubDate.last100 3.989229e-02 TRUE 3.989229e-02
## PubDate.month.fctr 1.914874e-02 TRUE 1.914874e-02
## PubDate.POSIX 1.568326e-02 TRUE 1.568326e-02
## PubDate.year.fctr NA FALSE NA
## PubDate.zoo 1.568326e-02 TRUE 1.568326e-02
## S.nchrs.log -2.246930e-01 FALSE 2.246930e-01
## S.npnct02.log -5.547032e-03 FALSE 5.547032e-03
## S.npnct05.log NA FALSE NA
## S.npnct09.log NA FALSE NA
## S.npnct10.log -5.547032e-03 FALSE 5.547032e-03
## S.npnct13.log -5.332519e-02 FALSE 5.332519e-02
## S.npnct16.log -1.587454e-03 FALSE 1.587454e-03
## S.npnct17.log NA FALSE NA
## S.npnct18.log NA FALSE NA
## S.npnct19.log 5.503894e-02 FALSE 5.503894e-02
## S.npnct21.log 2.760321e-02 FALSE 2.760321e-02
## S.npnct22.log NA FALSE NA
## S.npnct23.log 2.760321e-02 FALSE 2.760321e-02
## S.npnct24.log -9.890046e-19 FALSE 9.890046e-19
## S.npnct25.log NA FALSE NA
## S.npnct26.log NA FALSE NA
## S.npnct27.log NA FALSE NA
## S.npnct29.log NA FALSE NA
## S.npnct30.log NA FALSE NA
## S.nwrds.unq.log -2.461670e-01 FALSE 2.461670e-01
## S.P.daily.clip.report -4.388279e-02 FALSE 4.388279e-02
## S.P.http NA FALSE NA
## S.sum.TfIdf 1.484963e-01 FALSE 1.484963e-01
## S.T.articl -5.471737e-02 FALSE 5.471737e-02
## S.T.compani -4.787994e-02 FALSE 4.787994e-02
## S.T.fashion -8.417159e-02 FALSE 8.417159e-02
## S.T.first -4.447317e-02 FALSE 4.447317e-02
## S.T.intern -6.956906e-02 FALSE 6.956906e-02
## S.T.newyork -4.694998e-02 FALSE 4.694998e-02
## S.T.newyorktim -4.985328e-02 FALSE 4.985328e-02
## S.T.photo -6.874283e-02 FALSE 6.874283e-02
## S.T.report -4.779877e-02 FALSE 4.779877e-02
## S.T.senat -4.143422e-02 FALSE 4.143422e-02
## S.T.week -8.552704e-02 FALSE 8.552704e-02
## S.T.will -3.888838e-02 FALSE 3.888838e-02
## S.T.word -4.822452e-02 FALSE 4.822452e-02
## S.T.year -3.756011e-02 FALSE 3.756011e-02
## UniqueID 1.182492e-02 TRUE 1.182492e-02
## WordCount 2.575265e-01 TRUE 2.575265e-01
## cor.high.X freqRatio
## WordCount.log <NA> 1.300000
## H.P.readers.respond <NA> 342.789474
## myCategory.fctr <NA> 1.337185
## H.npnct19.log <NA> 14.995098
## H.npnct15.log <NA> 3.914910
## .clusterid.fctr <NA> 16.410959
## A.npnct13.log <NA> 4.603330
## A.npnct19.log <NA> 12.798715
## S.nuppr.log <NA> 1.152620
## S.T.diari <NA> 71.528090
## H.T.word <NA> 104.096774
## H.npnct08.log <NA> 111.620690
## H.T.read <NA> 179.388889
## H.ndgts.log <NA> 13.616137
## S.P.metropolitan.diary.colon <NA> 99.492308
## S.ratio.sum.TfIdf.nwrds <NA> 2.583333
## A.T.newyork <NA> 149.547619
## H.nuppr.log <NA> 1.033930
## S.T.make <NA> 273.782609
## PubDate.wkday.fctr <NA> 1.003268
## H.nstopwrds.log <NA> 1.370729
## H.ratio.nstopwrds.nwrds <NA> 1.141631
## H.npnct11.log <NA> 4.937442
## S.T.can <NA> 261.666667
## H.P.no.comment.colon <NA> 724.777778
## H.P.friday.night.music <NA> 543.333333
## A.T.newyorktim <NA> 84.540541
## S.npnct04.log <NA> 28.536364
## H.T.newyork <NA> 112.517857
## S.T.share <NA> 234.629630
## S.npnct08.log <NA> 175.486486
## H.sum.TfIdf <NA> 1.127273
## H.P.recap.colon <NA> 93.666667
## S.T.one <NA> 214.965517
## H.npnct07.log <NA> 5.437234
## PubDate.last10.log <NA> 1.666667
## H.T.report <NA> 102.000000
## A.nwrds.unq.log <NA> 1.054206
## A.T.report <NA> 80.371795
## PubDate.hour.fctr <NA> 1.835040
## A.T.articl <NA> 85.500000
## A.sum.TfIdf <NA> 2.583333
## S.nstopwrds.log <NA> 1.097879
## PubDate.minute.fctr <NA> 1.483365
## H.T.polit <NA> 128.780000
## S.ratio.nstopwrds.nwrds <NA> 1.908517
## A.T.intern <NA> 140.400000
## S.T.time <NA> 217.862069
## H.npnct12.log <NA> 13.126638
## S.T.take <NA> 274.608696
## H.T.art <NA> 293.363636
## H.npnct13.log <NA> 22.802326
## PubDate.second.fctr <NA> 1.018204
## H.T.week <NA> 71.352273
## H.T.get <NA> 430.866667
## S.npnct01.log <NA> 309.952381
## A.T.will <NA> 121.734694
## S.T.show <NA> 274.608696
## H.T.new <NA> 123.333333
## .rnorm <NA> 2.000000
## H.ratio.sum.TfIdf.nwrds <NA> 1.148148
## S.ndgts.log <NA> 10.511247
## H.T.say <NA> 247.461538
## A.T.first <NA> 225.250000
## A.T.photo <NA> 70.400000
## H.T.china <NA> 238.407407
## H.npnct01.log <NA> 282.913043
## H.T.make <NA> 322.200000
## A.T.senat <NA> 372.294118
## S.T.said <NA> 202.516129
## S.T.day <NA> 89.528571
## H.npnct28.log <NA> 24.123077
## H.T.news <NA> 322.000000
## H.npnct16.log <NA> 96.104478
## H.T.take <NA> 322.250000
## S.npnct12.log <NA> 5.706263
## H.T.busi <NA> 229.428571
## A.T.compani <NA> 137.111111
## S.npnct11.log <NA> 1.660473
## H.T.day <NA> 86.547945
## A.T.word <NA> 133.125000
## H.P.facts.figures <NA> 1087.666667
## H.T.X2014 <NA> 110.879310
## PubDate.last1.log <NA> 1.142857
## S.T.obama <NA> 398.625000
## PubDate.date.fctr <NA> 1.021394
## H.T.big <NA> 403.562500
## S.npnct14.log <NA> 203.062500
## A.npnct16.log <NA> 434.133333
## S.npnct06.log <NA> 115.642857
## S.T.appear <NA> 228.821429
## PubDate.last100.log <NA> 25.000000
## PubDate.wkend <NA> 9.095827
## H.T.ebola <NA> 293.000000
## H.nwrds.log <NA> 1.104308
## H.T.obama <NA> 229.750000
## A.T.year <NA> 160.815789
## A.nchrs.log <NA> 1.328571
## H.T.test <NA> 306.666667
## A.T.week <NA> 56.560748
## H.T.pictur <NA> 99.230769
## S.nwrds.log <NA> 1.049342
## H.T.newyorktim <NA> 433.266667
## S.npnct15.log <NA> 13.647191
## H.T.bank <NA> 214.300000
## H.T.billion <NA> 214.533333
## S.T.new <NA> 114.423077
## A.T.fashion <NA> 59.245283
## H.P.fashion.week <NA> 34.500000
## S.T.archiv <NA> 144.545455
## S.T.herald <NA> 144.750000
## H.T.springsumm <NA> 106.966667
## S.T.tribun <NA> 144.750000
## H.T.deal <NA> 230.428571
## H.P.first.draft <NA> 107.866667
## S.npnct28.log <NA> 134.791667
## H.P.daily.clip.report <NA> 104.354839
## H.P.today.in.smallbusiness <NA> 111.620690
## H.P.verbatim.colon <NA> 196.939394
## S.P.first.draft <NA> 434.466667
## H.npnct02.log <NA> 501.461538
## H.P.quandary <NA> 652.200000
## S.npnct20.log <NA> 543.333333
## S.npnct03.log <NA> 1305.400000
## A.npnct18.log <NA> 1631.500000
## A.T.presid <NA> 232.740741
## S.T.presid <NA> 232.740741
## S.P.year.colon <NA> 652.200000
## H.P.on.this.day <NA> 434.466667
## H.npnct05.log <NA> 543.333333
## S.npnct07.log <NA> 1631.750000
## S.P.fashion.week <NA> 40.081761
## H.P.s.notebook <NA> 815.500000
## .clusterid <NA> 16.410959
## A.ndgts.log S.ndgts.log 10.501022
## A.npnct01.log S.npnct01.log 309.952381
## A.npnct02.log A.P.http 1087.500000
## A.npnct03.log S.npnct03.log 1087.666667
## A.npnct04.log S.npnct04.log 28.536364
## A.npnct05.log <NA> 0.000000
## A.npnct06.log S.npnct06.log 115.642857
## A.npnct07.log S.npnct07.log 1631.750000
## A.npnct08.log <NA> 170.842105
## A.npnct09.log <NA> 0.000000
## A.npnct10.log <NA> 6531.000000
## A.npnct11.log S.npnct11.log 1.660473
## A.npnct12.log S.npnct12.log 5.715368
## A.npnct14.log A.npnct17.log 196.696970
## A.npnct15.log S.npnct15.log 13.482222
## A.npnct17.log A.npnct02.log 1087.500000
## A.npnct20.log S.npnct20.log 543.333333
## A.npnct21.log A.npnct23.log 3264.500000
## A.npnct22.log <NA> 0.000000
## A.npnct23.log <NA> 3264.500000
## A.npnct24.log <NA> 0.000000
## A.npnct25.log <NA> 6531.000000
## A.npnct26.log <NA> 0.000000
## A.npnct27.log <NA> 0.000000
## A.npnct28.log S.npnct28.log 126.862745
## A.npnct29.log <NA> 0.000000
## A.npnct30.log <NA> 0.000000
## A.nstopwrds.log S.nstopwrds.log 1.096091
## A.nuppr.log S.nuppr.log 1.151308
## A.nwrds.log S.nwrds.log 1.052805
## A.P.daily.clip.report H.T.clip 104.354839
## A.P.fashion.week S.P.fashion.week 40.081761
## A.P.first.draft S.P.first.draft 434.466667
## A.P.http A.npnct18.log 1305.200000
## A.P.metropolitan.diary.colon S.P.metropolitan.diary.colon 99.492308
## A.P.year.colon S.P.year.colon 652.200000
## A.ratio.nstopwrds.nwrds S.ratio.nstopwrds.nwrds 1.915094
## A.ratio.sum.TfIdf.nwrds A.nstopwrds.log 2.583333
## A.T.appear H.T.word 228.821429
## A.T.archiv S.T.intern 144.545455
## A.T.can S.T.can 261.666667
## A.T.day S.T.day 89.514286
## A.T.diari S.T.diari 71.528090
## A.T.herald S.T.herald 144.750000
## A.T.make S.T.make 273.782609
## A.T.new S.T.new 114.403846
## A.T.obama S.T.obama 398.625000
## A.T.one S.T.one 214.931034
## A.T.said S.T.said 202.516129
## A.T.share S.T.share 234.629630
## A.T.show S.T.show 263.166667
## A.T.take S.T.take 274.565217
## A.T.time S.T.time 217.827586
## A.T.tribun A.T.herald 144.750000
## H.nchrs.log H.nwrds.log 1.023810
## H.npnct03.log <NA> 2176.333333
## H.npnct04.log H.T.billion 38.325301
## H.npnct06.log H.npnct16.log 68.935484
## H.npnct09.log <NA> 0.000000
## H.npnct10.log <NA> 6531.000000
## H.npnct14.log H.T.springsumm 52.983471
## H.npnct17.log <NA> 0.000000
## H.npnct18.log <NA> 0.000000
## H.npnct20.log <NA> 6531.000000
## H.npnct21.log <NA> 0.000000
## H.npnct22.log <NA> 0.000000
## H.npnct23.log <NA> 0.000000
## H.npnct24.log <NA> 0.000000
## H.npnct25.log <NA> 0.000000
## H.npnct26.log <NA> 0.000000
## H.npnct27.log <NA> 0.000000
## H.npnct29.log <NA> 0.000000
## H.npnct30.log <NA> 0.000000
## H.nwrds.unq.log H.nuppr.log 1.000000
## H.P.http <NA> 0.000000
## H.P.today.in.politic H.T.polit 144.155556
## H.P.what.we.are H.T.read 141.000000
## H.P.year.colon A.T.archiv 32.670103
## H.T.clip <NA> 104.354839
## H.T.daili H.T.report 102.903226
## H.T.fashion H.P.fashion.week 76.926829
## H.T.first H.P.first.draft 194.727273
## H.T.morn A.npnct28.log 165.205128
## H.T.today H.P.today.in.politic 138.239130
## H.T.X2015 A.T.diari 96.833333
## Popular <NA> 4.976212
## Popular.fctr <NA> NA
## PubDate.last1 <NA> 1.142857
## PubDate.last10 <NA> 1.666667
## PubDate.last100 <NA> 25.000000
## PubDate.month.fctr <NA> 1.017514
## PubDate.POSIX <NA> 1.000000
## PubDate.year.fctr <NA> 0.000000
## PubDate.zoo <NA> 1.000000
## S.nchrs.log A.nwrds.log 1.328571
## S.npnct02.log <NA> 6531.000000
## S.npnct05.log <NA> 0.000000
## S.npnct09.log <NA> 0.000000
## S.npnct10.log <NA> 6531.000000
## S.npnct13.log A.npnct13.log 4.672000
## S.npnct16.log <NA> 434.133333
## S.npnct17.log <NA> 0.000000
## S.npnct18.log <NA> 0.000000
## S.npnct19.log A.npnct19.log 12.862366
## S.npnct21.log A.npnct21.log 6531.000000
## S.npnct22.log <NA> 0.000000
## S.npnct23.log <NA> 6531.000000
## S.npnct24.log <NA> 0.000000
## S.npnct25.log <NA> 0.000000
## S.npnct26.log <NA> 0.000000
## S.npnct27.log <NA> 0.000000
## S.npnct29.log <NA> 0.000000
## S.npnct30.log <NA> 0.000000
## S.nwrds.unq.log S.nchrs.log 1.054206
## S.P.daily.clip.report <NA> 104.354839
## S.P.http <NA> 0.000000
## S.sum.TfIdf A.sum.TfIdf 2.583333
## S.T.articl A.T.articl 85.500000
## S.T.compani A.T.compani 137.111111
## S.T.fashion H.T.X2015 59.245283
## S.T.first A.T.first 225.250000
## S.T.intern A.T.intern 140.400000
## S.T.newyork A.T.newyork 149.547619
## S.T.newyorktim A.T.newyorktim 84.540541
## S.T.photo A.T.photo 70.400000
## S.T.report A.T.report 80.371795
## S.T.senat A.T.senat 372.352941
## S.T.week A.T.week 56.560748
## S.T.will A.T.will 119.340000
## S.T.word A.T.word 133.125000
## S.T.year A.T.year 160.815789
## UniqueID <NA> 1.000000
## WordCount <NA> 2.315789
## percentUnique zeroVar nzv myNearZV
## WordCount.log 24.14268218 FALSE FALSE FALSE
## H.P.readers.respond 0.03061849 FALSE TRUE FALSE
## myCategory.fctr 0.30618494 FALSE FALSE FALSE
## H.npnct19.log 0.06123699 FALSE FALSE FALSE
## H.npnct15.log 0.04592774 FALSE FALSE FALSE
## .clusterid.fctr 1.17881200 FALSE FALSE FALSE
## A.npnct13.log 0.16840171 FALSE FALSE FALSE
## A.npnct19.log 0.07654623 FALSE FALSE FALSE
## S.nuppr.log 0.33680343 FALSE FALSE FALSE
## S.T.diari 0.18371096 FALSE TRUE FALSE
## H.T.word 0.13778322 FALSE TRUE FALSE
## H.npnct08.log 0.03061849 FALSE TRUE FALSE
## H.T.read 0.16840171 FALSE TRUE FALSE
## H.ndgts.log 0.18371096 FALSE FALSE FALSE
## S.P.metropolitan.diary.colon 0.03061849 FALSE TRUE FALSE
## S.ratio.sum.TfIdf.nwrds 94.45805266 FALSE FALSE FALSE
## A.T.newyork 0.44396816 FALSE TRUE FALSE
## H.nuppr.log 0.29087569 FALSE FALSE FALSE
## S.T.make 0.44396816 FALSE TRUE FALSE
## PubDate.wkday.fctr 0.10716473 FALSE FALSE FALSE
## H.nstopwrds.log 0.12247397 FALSE FALSE FALSE
## H.ratio.nstopwrds.nwrds 0.96448255 FALSE FALSE FALSE
## H.npnct11.log 0.07654623 FALSE FALSE FALSE
## S.T.can 0.41334966 FALSE TRUE FALSE
## H.P.no.comment.colon 0.03061849 FALSE TRUE FALSE
## H.P.friday.night.music 0.03061849 FALSE TRUE FALSE
## A.T.newyorktim 0.32149418 FALSE TRUE FALSE
## S.npnct04.log 0.07654623 FALSE TRUE FALSE
## H.T.newyork 0.15309247 FALSE TRUE FALSE
## S.T.share 0.38273117 FALSE TRUE FALSE
## S.npnct08.log 0.06123699 FALSE TRUE FALSE
## H.sum.TfIdf 84.44580527 FALSE FALSE FALSE
## H.P.recap.colon 0.03061849 FALSE TRUE FALSE
## S.T.one 0.44396816 FALSE TRUE FALSE
## H.npnct07.log 0.12247397 FALSE FALSE FALSE
## PubDate.last10.log 79.05695040 FALSE FALSE FALSE
## H.T.report 0.16840171 FALSE TRUE FALSE
## A.nwrds.unq.log 0.55113288 FALSE FALSE FALSE
## A.T.report 0.38273117 FALSE TRUE FALSE
## PubDate.hour.fctr 0.04592774 FALSE FALSE FALSE
## A.T.articl 0.29087569 FALSE TRUE FALSE
## A.sum.TfIdf 94.27434170 FALSE FALSE FALSE
## S.nstopwrds.log 0.38273117 FALSE FALSE FALSE
## PubDate.minute.fctr 0.06123699 FALSE FALSE FALSE
## H.T.polit 0.13778322 FALSE TRUE FALSE
## S.ratio.nstopwrds.nwrds 3.75076546 FALSE FALSE FALSE
## A.T.intern 0.32149418 FALSE TRUE FALSE
## S.T.time 0.42865891 FALSE TRUE FALSE
## H.npnct12.log 0.09185548 FALSE FALSE FALSE
## S.T.take 0.38273117 FALSE TRUE FALSE
## H.T.art 0.19902021 FALSE TRUE FALSE
## H.npnct13.log 0.12247397 FALSE TRUE FALSE
## PubDate.second.fctr 0.06123699 FALSE FALSE FALSE
## H.T.week 0.16840171 FALSE TRUE FALSE
## H.T.get 0.18371096 FALSE TRUE FALSE
## S.npnct01.log 0.06123699 FALSE TRUE FALSE
## A.T.will 0.59706062 FALSE TRUE FALSE
## S.T.show 0.38273117 FALSE TRUE FALSE
## H.T.new 0.19902021 FALSE TRUE FALSE
## .rnorm 99.98469075 FALSE FALSE FALSE
## H.ratio.sum.TfIdf.nwrds 90.46233925 FALSE FALSE FALSE
## S.ndgts.log 0.26025720 FALSE FALSE FALSE
## H.T.say 0.16840171 FALSE TRUE FALSE
## A.T.first 0.42865891 FALSE TRUE FALSE
## A.T.photo 0.27556644 FALSE TRUE FALSE
## H.T.china 0.16840171 FALSE TRUE FALSE
## H.npnct01.log 0.04592774 FALSE TRUE FALSE
## H.T.make 0.13778322 FALSE TRUE FALSE
## A.T.senat 0.50520514 FALSE TRUE FALSE
## S.T.said 0.38273117 FALSE TRUE FALSE
## S.T.day 0.41334966 FALSE TRUE FALSE
## H.npnct28.log 0.03061849 FALSE TRUE FALSE
## H.T.news 0.15309247 FALSE TRUE FALSE
## H.npnct16.log 0.06123699 FALSE TRUE FALSE
## H.T.take 0.15309247 FALSE TRUE FALSE
## S.npnct12.log 0.09185548 FALSE FALSE FALSE
## H.T.busi 0.18371096 FALSE TRUE FALSE
## A.T.compani 0.48989590 FALSE TRUE FALSE
## S.npnct11.log 0.13778322 FALSE FALSE FALSE
## H.T.day 0.18371096 FALSE TRUE FALSE
## A.T.word 0.30618494 FALSE TRUE FALSE
## H.P.facts.figures 0.03061849 FALSE TRUE FALSE
## H.T.X2014 0.13778322 FALSE TRUE FALSE
## PubDate.last1.log 36.49724434 FALSE FALSE FALSE
## S.T.obama 0.38273117 FALSE TRUE FALSE
## PubDate.date.fctr 0.07654623 FALSE FALSE FALSE
## H.T.big 0.19902021 FALSE TRUE FALSE
## S.npnct14.log 0.04592774 FALSE TRUE FALSE
## A.npnct16.log 0.04592774 FALSE TRUE FALSE
## S.npnct06.log 0.03061849 FALSE TRUE FALSE
## S.T.appear 0.30618494 FALSE TRUE FALSE
## PubDate.last100.log 92.19228414 FALSE FALSE FALSE
## PubDate.wkend 0.03061849 FALSE FALSE FALSE
## H.T.ebola 0.16840171 FALSE TRUE FALSE
## H.nwrds.log 0.32149418 FALSE FALSE FALSE
## H.T.obama 0.16840171 FALSE TRUE FALSE
## A.T.year 0.48989590 FALSE TRUE FALSE
## A.nchrs.log 4.39375383 FALSE FALSE FALSE
## H.T.test 0.13778322 FALSE TRUE FALSE
## A.T.week 0.47458665 FALSE TRUE FALSE
## H.T.pictur 0.10716473 FALSE TRUE FALSE
## S.nwrds.log 0.73484385 FALSE FALSE FALSE
## H.T.newyorktim 0.12247397 FALSE TRUE FALSE
## S.npnct15.log 0.04592774 FALSE FALSE FALSE
## H.T.bank 0.13778322 FALSE TRUE FALSE
## H.T.billion 0.13778322 FALSE TRUE FALSE
## S.T.new 0.47458665 FALSE TRUE FALSE
## A.T.fashion 0.39804042 FALSE TRUE FALSE
## H.P.fashion.week 0.03061849 FALSE TRUE FALSE
## S.T.archiv 0.24494795 FALSE TRUE FALSE
## S.T.herald 0.24494795 FALSE TRUE FALSE
## H.T.springsumm 0.09185548 FALSE TRUE FALSE
## S.T.tribun 0.24494795 FALSE TRUE FALSE
## H.T.deal 0.13778322 FALSE TRUE FALSE
## H.P.first.draft 0.03061849 FALSE TRUE FALSE
## S.npnct28.log 0.04592774 FALSE TRUE FALSE
## H.P.daily.clip.report 0.03061849 FALSE TRUE FALSE
## H.P.today.in.smallbusiness 0.03061849 FALSE TRUE FALSE
## H.P.verbatim.colon 0.03061849 FALSE TRUE FALSE
## S.P.first.draft 0.03061849 FALSE TRUE FALSE
## H.npnct02.log 0.03061849 FALSE TRUE FALSE
## H.P.quandary 0.03061849 FALSE TRUE FALSE
## S.npnct20.log 0.03061849 FALSE TRUE FALSE
## S.npnct03.log 0.03061849 FALSE TRUE FALSE
## A.npnct18.log 0.06123699 FALSE TRUE FALSE
## A.T.presid 0.45927740 FALSE TRUE FALSE
## S.T.presid 0.42865891 FALSE TRUE FALSE
## S.P.year.colon 0.03061849 FALSE TRUE FALSE
## H.P.on.this.day 0.03061849 FALSE TRUE FALSE
## H.npnct05.log 0.03061849 FALSE TRUE FALSE
## S.npnct07.log 0.04592774 FALSE TRUE FALSE
## S.P.fashion.week 0.03061849 FALSE TRUE FALSE
## H.P.s.notebook 0.03061849 FALSE TRUE FALSE
## .clusterid 1.17881200 FALSE FALSE FALSE
## A.ndgts.log 0.29087569 FALSE FALSE FALSE
## A.npnct01.log 0.06123699 FALSE TRUE FALSE
## A.npnct02.log 0.04592774 FALSE TRUE FALSE
## A.npnct03.log 0.03061849 FALSE TRUE FALSE
## A.npnct04.log 0.07654623 FALSE TRUE FALSE
## A.npnct05.log 0.01530925 TRUE TRUE TRUE
## A.npnct06.log 0.03061849 FALSE TRUE FALSE
## A.npnct07.log 0.04592774 FALSE TRUE FALSE
## A.npnct08.log 0.06123699 FALSE TRUE FALSE
## A.npnct09.log 0.01530925 TRUE TRUE TRUE
## A.npnct10.log 0.03061849 FALSE TRUE TRUE
## A.npnct11.log 0.13778322 FALSE FALSE FALSE
## A.npnct12.log 0.12247397 FALSE FALSE FALSE
## A.npnct14.log 0.10716473 FALSE TRUE FALSE
## A.npnct15.log 0.04592774 FALSE FALSE FALSE
## A.npnct17.log 0.04592774 FALSE TRUE FALSE
## A.npnct20.log 0.03061849 FALSE TRUE FALSE
## A.npnct21.log 0.04592774 FALSE TRUE TRUE
## A.npnct22.log 0.01530925 TRUE TRUE TRUE
## A.npnct23.log 0.04592774 FALSE TRUE TRUE
## A.npnct24.log 0.01530925 TRUE TRUE TRUE
## A.npnct25.log 0.03061849 FALSE TRUE TRUE
## A.npnct26.log 0.01530925 TRUE TRUE TRUE
## A.npnct27.log 0.01530925 TRUE TRUE TRUE
## A.npnct28.log 0.04592774 FALSE TRUE FALSE
## A.npnct29.log 0.01530925 TRUE TRUE TRUE
## A.npnct30.log 0.01530925 TRUE TRUE TRUE
## A.nstopwrds.log 0.42865891 FALSE FALSE FALSE
## A.nuppr.log 0.33680343 FALSE FALSE FALSE
## A.nwrds.log 0.93386405 FALSE FALSE FALSE
## A.P.daily.clip.report 0.03061849 FALSE TRUE FALSE
## A.P.fashion.week 0.03061849 FALSE TRUE FALSE
## A.P.first.draft 0.03061849 FALSE TRUE FALSE
## A.P.http 0.04592774 FALSE TRUE FALSE
## A.P.metropolitan.diary.colon 0.03061849 FALSE TRUE FALSE
## A.P.year.colon 0.03061849 FALSE TRUE FALSE
## A.ratio.nstopwrds.nwrds 4.10287814 FALSE FALSE FALSE
## A.ratio.sum.TfIdf.nwrds 94.51928965 FALSE FALSE FALSE
## A.T.appear 0.30618494 FALSE TRUE FALSE
## A.T.archiv 0.24494795 FALSE TRUE FALSE
## A.T.can 0.48989590 FALSE TRUE FALSE
## A.T.day 0.44396816 FALSE TRUE FALSE
## A.T.diari 0.18371096 FALSE TRUE FALSE
## A.T.herald 0.24494795 FALSE TRUE FALSE
## A.T.make 0.44396816 FALSE TRUE FALSE
## A.T.new 0.48989590 FALSE TRUE FALSE
## A.T.obama 0.42865891 FALSE TRUE FALSE
## A.T.one 0.48989590 FALSE TRUE FALSE
## A.T.said 0.41334966 FALSE TRUE FALSE
## A.T.share 0.38273117 FALSE TRUE FALSE
## A.T.show 0.39804042 FALSE TRUE FALSE
## A.T.take 0.42865891 FALSE TRUE FALSE
## A.T.time 0.42865891 FALSE TRUE FALSE
## A.T.tribun 0.24494795 FALSE TRUE FALSE
## H.nchrs.log 1.57685242 FALSE FALSE FALSE
## H.npnct03.log 0.03061849 FALSE TRUE TRUE
## H.npnct04.log 0.04592774 FALSE TRUE FALSE
## H.npnct06.log 0.06123699 FALSE TRUE FALSE
## H.npnct09.log 0.01530925 TRUE TRUE TRUE
## H.npnct10.log 0.03061849 FALSE TRUE TRUE
## H.npnct14.log 0.03061849 FALSE TRUE FALSE
## H.npnct17.log 0.01530925 TRUE TRUE TRUE
## H.npnct18.log 0.01530925 TRUE TRUE TRUE
## H.npnct20.log 0.03061849 FALSE TRUE TRUE
## H.npnct21.log 0.01530925 TRUE TRUE TRUE
## H.npnct22.log 0.01530925 TRUE TRUE TRUE
## H.npnct23.log 0.01530925 TRUE TRUE TRUE
## H.npnct24.log 0.01530925 TRUE TRUE TRUE
## H.npnct25.log 0.01530925 TRUE TRUE TRUE
## H.npnct26.log 0.01530925 TRUE TRUE TRUE
## H.npnct27.log 0.01530925 TRUE TRUE TRUE
## H.npnct29.log 0.01530925 TRUE TRUE TRUE
## H.npnct30.log 0.01530925 TRUE TRUE TRUE
## H.nwrds.unq.log 0.21432945 FALSE FALSE FALSE
## H.P.http 0.01530925 TRUE TRUE TRUE
## H.P.today.in.politic 0.03061849 FALSE TRUE FALSE
## H.P.what.we.are 0.03061849 FALSE TRUE FALSE
## H.P.year.colon 0.03061849 FALSE TRUE FALSE
## H.T.clip 0.03061849 FALSE TRUE FALSE
## H.T.daili 0.16840171 FALSE TRUE FALSE
## H.T.fashion 0.19902021 FALSE TRUE FALSE
## H.T.first 0.15309247 FALSE TRUE FALSE
## H.T.morn 0.07654623 FALSE TRUE FALSE
## H.T.today 0.13778322 FALSE TRUE FALSE
## H.T.X2015 0.10716473 FALSE TRUE FALSE
## Popular 0.03061849 FALSE FALSE FALSE
## Popular.fctr NA NA NA NA
## PubDate.last1 36.49724434 FALSE FALSE FALSE
## PubDate.last10 79.05695040 FALSE FALSE FALSE
## PubDate.last100 92.52908757 FALSE FALSE FALSE
## PubDate.month.fctr 0.04592774 FALSE FALSE FALSE
## PubDate.POSIX 99.86221678 FALSE FALSE FALSE
## PubDate.year.fctr 0.01530925 TRUE TRUE TRUE
## PubDate.zoo 99.86221678 FALSE FALSE FALSE
## S.nchrs.log 3.72014697 FALSE FALSE FALSE
## S.npnct02.log 0.03061849 FALSE TRUE TRUE
## S.npnct05.log 0.01530925 TRUE TRUE TRUE
## S.npnct09.log 0.01530925 TRUE TRUE TRUE
## S.npnct10.log 0.03061849 FALSE TRUE TRUE
## S.npnct13.log 0.16840171 FALSE FALSE FALSE
## S.npnct16.log 0.04592774 FALSE TRUE FALSE
## S.npnct17.log 0.01530925 TRUE TRUE TRUE
## S.npnct18.log 0.01530925 TRUE TRUE TRUE
## S.npnct19.log 0.07654623 FALSE FALSE FALSE
## S.npnct21.log 0.03061849 FALSE TRUE TRUE
## S.npnct22.log 0.01530925 TRUE TRUE TRUE
## S.npnct23.log 0.03061849 FALSE TRUE TRUE
## S.npnct24.log 0.01530925 TRUE TRUE TRUE
## S.npnct25.log 0.01530925 TRUE TRUE TRUE
## S.npnct26.log 0.01530925 TRUE TRUE TRUE
## S.npnct27.log 0.01530925 TRUE TRUE TRUE
## S.npnct29.log 0.01530925 TRUE TRUE TRUE
## S.npnct30.log 0.01530925 TRUE TRUE TRUE
## S.nwrds.unq.log 0.44396816 FALSE FALSE FALSE
## S.P.daily.clip.report 0.03061849 FALSE TRUE FALSE
## S.P.http 0.01530925 TRUE TRUE TRUE
## S.sum.TfIdf 94.32026944 FALSE FALSE FALSE
## S.T.articl 0.29087569 FALSE TRUE FALSE
## S.T.compani 0.44396816 FALSE TRUE FALSE
## S.T.fashion 0.38273117 FALSE TRUE FALSE
## S.T.first 0.41334966 FALSE TRUE FALSE
## S.T.intern 0.30618494 FALSE TRUE FALSE
## S.T.newyork 0.41334966 FALSE TRUE FALSE
## S.T.newyorktim 0.33680343 FALSE TRUE FALSE
## S.T.photo 0.29087569 FALSE TRUE FALSE
## S.T.report 0.35211268 FALSE TRUE FALSE
## S.T.senat 0.47458665 FALSE TRUE FALSE
## S.T.week 0.41334966 FALSE TRUE FALSE
## S.T.will 0.55113288 FALSE TRUE FALSE
## S.T.word 0.30618494 FALSE TRUE FALSE
## S.T.year 0.45927740 FALSE TRUE FALSE
## UniqueID 100.00000000 FALSE FALSE FALSE
## WordCount 24.15799143 FALSE FALSE FALSE
## is.cor.y.abs.low rsp_var_raw id_var rsp_var
## WordCount.log FALSE FALSE NA NA
## H.P.readers.respond FALSE FALSE NA NA
## myCategory.fctr FALSE FALSE NA NA
## H.npnct19.log FALSE FALSE NA NA
## H.npnct15.log FALSE FALSE NA NA
## .clusterid.fctr FALSE FALSE NA NA
## A.npnct13.log FALSE FALSE NA NA
## A.npnct19.log FALSE FALSE NA NA
## S.nuppr.log FALSE FALSE NA NA
## S.T.diari FALSE FALSE NA NA
## H.T.word FALSE FALSE NA NA
## H.npnct08.log FALSE FALSE NA NA
## H.T.read FALSE FALSE NA NA
## H.ndgts.log FALSE FALSE NA NA
## S.P.metropolitan.diary.colon FALSE FALSE NA NA
## S.ratio.sum.TfIdf.nwrds FALSE FALSE NA NA
## A.T.newyork FALSE FALSE NA NA
## H.nuppr.log FALSE FALSE NA NA
## S.T.make FALSE FALSE NA NA
## PubDate.wkday.fctr FALSE FALSE NA NA
## H.nstopwrds.log FALSE FALSE NA NA
## H.ratio.nstopwrds.nwrds FALSE FALSE NA NA
## H.npnct11.log FALSE FALSE NA NA
## S.T.can FALSE FALSE NA NA
## H.P.no.comment.colon FALSE FALSE NA NA
## H.P.friday.night.music FALSE FALSE NA NA
## A.T.newyorktim FALSE FALSE NA NA
## S.npnct04.log FALSE FALSE NA NA
## H.T.newyork FALSE FALSE NA NA
## S.T.share FALSE FALSE NA NA
## S.npnct08.log TRUE FALSE NA NA
## H.sum.TfIdf FALSE FALSE NA NA
## H.P.recap.colon FALSE FALSE NA NA
## S.T.one FALSE FALSE NA NA
## H.npnct07.log FALSE FALSE NA NA
## PubDate.last10.log FALSE FALSE NA NA
## H.T.report FALSE FALSE NA NA
## A.nwrds.unq.log FALSE FALSE NA NA
## A.T.report FALSE FALSE NA NA
## PubDate.hour.fctr FALSE FALSE NA NA
## A.T.articl FALSE FALSE NA NA
## A.sum.TfIdf FALSE FALSE NA NA
## S.nstopwrds.log FALSE FALSE NA NA
## PubDate.minute.fctr FALSE FALSE NA NA
## H.T.polit FALSE FALSE NA NA
## S.ratio.nstopwrds.nwrds FALSE FALSE NA NA
## A.T.intern FALSE FALSE NA NA
## S.T.time FALSE FALSE NA NA
## H.npnct12.log FALSE FALSE NA NA
## S.T.take FALSE FALSE NA NA
## H.T.art FALSE FALSE NA NA
## H.npnct13.log FALSE FALSE NA NA
## PubDate.second.fctr FALSE FALSE NA NA
## H.T.week FALSE FALSE NA NA
## H.T.get FALSE FALSE NA NA
## S.npnct01.log FALSE FALSE NA NA
## A.T.will FALSE FALSE NA NA
## S.T.show FALSE FALSE NA NA
## H.T.new FALSE FALSE NA NA
## .rnorm FALSE FALSE NA NA
## H.ratio.sum.TfIdf.nwrds FALSE FALSE NA NA
## S.ndgts.log FALSE FALSE NA NA
## H.T.say FALSE FALSE NA NA
## A.T.first FALSE FALSE NA NA
## A.T.photo FALSE FALSE NA NA
## H.T.china FALSE FALSE NA NA
## H.npnct01.log FALSE FALSE NA NA
## H.T.make FALSE FALSE NA NA
## A.T.senat FALSE FALSE NA NA
## S.T.said FALSE FALSE NA NA
## S.T.day FALSE FALSE NA NA
## H.npnct28.log FALSE FALSE NA NA
## H.T.news FALSE FALSE NA NA
## H.npnct16.log FALSE FALSE NA NA
## H.T.take TRUE FALSE NA NA
## S.npnct12.log FALSE FALSE NA NA
## H.T.busi FALSE FALSE NA NA
## A.T.compani FALSE FALSE NA NA
## S.npnct11.log FALSE FALSE NA NA
## H.T.day FALSE FALSE NA NA
## A.T.word FALSE FALSE NA NA
## H.P.facts.figures FALSE FALSE NA NA
## H.T.X2014 FALSE FALSE NA NA
## PubDate.last1.log FALSE FALSE NA NA
## S.T.obama FALSE FALSE NA NA
## PubDate.date.fctr FALSE FALSE NA NA
## H.T.big FALSE FALSE NA NA
## S.npnct14.log FALSE FALSE NA NA
## A.npnct16.log TRUE FALSE NA NA
## S.npnct06.log FALSE FALSE NA NA
## S.T.appear FALSE FALSE NA NA
## PubDate.last100.log TRUE FALSE NA NA
## PubDate.wkend FALSE FALSE NA NA
## H.T.ebola FALSE FALSE NA NA
## H.nwrds.log FALSE FALSE NA NA
## H.T.obama FALSE FALSE NA NA
## A.T.year FALSE FALSE NA NA
## A.nchrs.log FALSE FALSE NA NA
## H.T.test FALSE FALSE NA NA
## A.T.week FALSE FALSE NA NA
## H.T.pictur FALSE FALSE NA NA
## S.nwrds.log FALSE FALSE NA NA
## H.T.newyorktim FALSE FALSE NA NA
## S.npnct15.log FALSE FALSE NA NA
## H.T.bank FALSE FALSE NA NA
## H.T.billion FALSE FALSE NA NA
## S.T.new FALSE FALSE NA NA
## A.T.fashion FALSE FALSE NA NA
## H.P.fashion.week FALSE FALSE NA NA
## S.T.archiv FALSE FALSE NA NA
## S.T.herald FALSE FALSE NA NA
## H.T.springsumm FALSE FALSE NA NA
## S.T.tribun FALSE FALSE NA NA
## H.T.deal FALSE FALSE NA NA
## H.P.first.draft FALSE FALSE NA NA
## S.npnct28.log FALSE FALSE NA NA
## H.P.daily.clip.report FALSE FALSE NA NA
## H.P.today.in.smallbusiness FALSE FALSE NA NA
## H.P.verbatim.colon FALSE FALSE NA NA
## S.P.first.draft FALSE FALSE NA NA
## H.npnct02.log FALSE FALSE NA NA
## H.P.quandary FALSE FALSE NA NA
## S.npnct20.log FALSE FALSE NA NA
## S.npnct03.log FALSE FALSE NA NA
## A.npnct18.log FALSE FALSE NA NA
## A.T.presid TRUE FALSE NA NA
## S.T.presid TRUE FALSE NA NA
## S.P.year.colon FALSE FALSE NA NA
## H.P.on.this.day FALSE FALSE NA NA
## H.npnct05.log FALSE FALSE NA NA
## S.npnct07.log FALSE FALSE NA NA
## S.P.fashion.week FALSE FALSE NA NA
## H.P.s.notebook TRUE FALSE NA NA
## .clusterid FALSE FALSE NA NA
## A.ndgts.log FALSE FALSE NA NA
## A.npnct01.log FALSE FALSE NA NA
## A.npnct02.log FALSE FALSE NA NA
## A.npnct03.log FALSE FALSE NA NA
## A.npnct04.log FALSE FALSE NA NA
## A.npnct05.log NA FALSE NA NA
## A.npnct06.log FALSE FALSE NA NA
## A.npnct07.log FALSE FALSE NA NA
## A.npnct08.log TRUE FALSE NA NA
## A.npnct09.log NA FALSE NA NA
## A.npnct10.log TRUE FALSE NA NA
## A.npnct11.log FALSE FALSE NA NA
## A.npnct12.log FALSE FALSE NA NA
## A.npnct14.log FALSE FALSE NA NA
## A.npnct15.log FALSE FALSE NA NA
## A.npnct17.log FALSE FALSE NA NA
## A.npnct20.log FALSE FALSE NA NA
## A.npnct21.log FALSE FALSE NA NA
## A.npnct22.log NA FALSE NA NA
## A.npnct23.log FALSE FALSE NA NA
## A.npnct24.log TRUE FALSE NA NA
## A.npnct25.log TRUE FALSE NA NA
## A.npnct26.log NA FALSE NA NA
## A.npnct27.log NA FALSE NA NA
## A.npnct28.log FALSE FALSE NA NA
## A.npnct29.log NA FALSE NA NA
## A.npnct30.log NA FALSE NA NA
## A.nstopwrds.log FALSE FALSE NA NA
## A.nuppr.log FALSE FALSE NA NA
## A.nwrds.log FALSE FALSE NA NA
## A.P.daily.clip.report FALSE FALSE NA NA
## A.P.fashion.week FALSE FALSE NA NA
## A.P.first.draft FALSE FALSE NA NA
## A.P.http FALSE FALSE NA NA
## A.P.metropolitan.diary.colon FALSE FALSE NA NA
## A.P.year.colon FALSE FALSE NA NA
## A.ratio.nstopwrds.nwrds FALSE FALSE NA NA
## A.ratio.sum.TfIdf.nwrds FALSE FALSE NA NA
## A.T.appear FALSE FALSE NA NA
## A.T.archiv FALSE FALSE NA NA
## A.T.can FALSE FALSE NA NA
## A.T.day FALSE FALSE NA NA
## A.T.diari FALSE FALSE NA NA
## A.T.herald FALSE FALSE NA NA
## A.T.make FALSE FALSE NA NA
## A.T.new FALSE FALSE NA NA
## A.T.obama FALSE FALSE NA NA
## A.T.one FALSE FALSE NA NA
## A.T.said FALSE FALSE NA NA
## A.T.share FALSE FALSE NA NA
## A.T.show FALSE FALSE NA NA
## A.T.take FALSE FALSE NA NA
## A.T.time FALSE FALSE NA NA
## A.T.tribun FALSE FALSE NA NA
## H.nchrs.log FALSE FALSE NA NA
## H.npnct03.log FALSE FALSE NA NA
## H.npnct04.log FALSE FALSE NA NA
## H.npnct06.log FALSE FALSE NA NA
## H.npnct09.log NA FALSE NA NA
## H.npnct10.log TRUE FALSE NA NA
## H.npnct14.log FALSE FALSE NA NA
## H.npnct17.log NA FALSE NA NA
## H.npnct18.log NA FALSE NA NA
## H.npnct20.log TRUE FALSE NA NA
## H.npnct21.log NA FALSE NA NA
## H.npnct22.log NA FALSE NA NA
## H.npnct23.log NA FALSE NA NA
## H.npnct24.log TRUE FALSE NA NA
## H.npnct25.log NA FALSE NA NA
## H.npnct26.log NA FALSE NA NA
## H.npnct27.log NA FALSE NA NA
## H.npnct29.log NA FALSE NA NA
## H.npnct30.log NA FALSE NA NA
## H.nwrds.unq.log FALSE FALSE NA NA
## H.P.http NA FALSE NA NA
## H.P.today.in.politic FALSE FALSE NA NA
## H.P.what.we.are FALSE FALSE NA NA
## H.P.year.colon FALSE FALSE NA NA
## H.T.clip FALSE FALSE NA NA
## H.T.daili FALSE FALSE NA NA
## H.T.fashion FALSE FALSE NA NA
## H.T.first FALSE FALSE NA NA
## H.T.morn FALSE FALSE NA NA
## H.T.today FALSE FALSE NA NA
## H.T.X2015 FALSE FALSE NA NA
## Popular FALSE TRUE NA NA
## Popular.fctr NA NA NA TRUE
## PubDate.last1 FALSE FALSE NA NA
## PubDate.last10 FALSE FALSE NA NA
## PubDate.last100 FALSE FALSE NA NA
## PubDate.month.fctr FALSE FALSE NA NA
## PubDate.POSIX FALSE FALSE NA NA
## PubDate.year.fctr NA FALSE NA NA
## PubDate.zoo FALSE FALSE NA NA
## S.nchrs.log FALSE FALSE NA NA
## S.npnct02.log TRUE FALSE NA NA
## S.npnct05.log NA FALSE NA NA
## S.npnct09.log NA FALSE NA NA
## S.npnct10.log TRUE FALSE NA NA
## S.npnct13.log FALSE FALSE NA NA
## S.npnct16.log TRUE FALSE NA NA
## S.npnct17.log NA FALSE NA NA
## S.npnct18.log NA FALSE NA NA
## S.npnct19.log FALSE FALSE NA NA
## S.npnct21.log FALSE FALSE NA NA
## S.npnct22.log NA FALSE NA NA
## S.npnct23.log FALSE FALSE NA NA
## S.npnct24.log TRUE FALSE NA NA
## S.npnct25.log NA FALSE NA NA
## S.npnct26.log NA FALSE NA NA
## S.npnct27.log NA FALSE NA NA
## S.npnct29.log NA FALSE NA NA
## S.npnct30.log NA FALSE NA NA
## S.nwrds.unq.log FALSE FALSE NA NA
## S.P.daily.clip.report FALSE FALSE NA NA
## S.P.http NA FALSE NA NA
## S.sum.TfIdf FALSE FALSE NA NA
## S.T.articl FALSE FALSE NA NA
## S.T.compani FALSE FALSE NA NA
## S.T.fashion FALSE FALSE NA NA
## S.T.first FALSE FALSE NA NA
## S.T.intern FALSE FALSE NA NA
## S.T.newyork FALSE FALSE NA NA
## S.T.newyorktim FALSE FALSE NA NA
## S.T.photo FALSE FALSE NA NA
## S.T.report FALSE FALSE NA NA
## S.T.senat FALSE FALSE NA NA
## S.T.week FALSE FALSE NA NA
## S.T.will FALSE FALSE NA NA
## S.T.word FALSE FALSE NA NA
## S.T.year FALSE FALSE NA NA
## UniqueID FALSE FALSE TRUE NA
## WordCount FALSE FALSE NA NA
## Low.cor.X.glm.importance Final.glm.importance
## WordCount.log 1.000000e+02 1.000000e+02
## H.P.readers.respond 5.146747e+01 5.146747e+01
## myCategory.fctr 4.281250e+01 4.281250e+01
## H.npnct19.log 4.004695e+01 4.004695e+01
## H.npnct15.log 3.467378e+01 3.467378e+01
## .clusterid.fctr 2.932308e+01 2.932308e+01
## A.npnct13.log 2.871074e+01 2.871074e+01
## A.npnct19.log 2.850228e+01 2.850228e+01
## S.nuppr.log 2.501950e+01 2.501950e+01
## S.T.diari 2.124067e+01 2.124067e+01
## H.T.word 2.105883e+01 2.105883e+01
## H.npnct08.log 2.045852e+01 2.045852e+01
## H.T.read 2.019170e+01 2.019170e+01
## H.ndgts.log 1.951144e+01 1.951144e+01
## S.P.metropolitan.diary.colon 1.888990e+01 1.888990e+01
## S.ratio.sum.TfIdf.nwrds 1.854118e+01 1.854118e+01
## A.T.newyork 1.825819e+01 1.825819e+01
## H.nuppr.log 1.817153e+01 1.817153e+01
## S.T.make 1.723536e+01 1.723536e+01
## PubDate.wkday.fctr 1.661497e+01 1.661497e+01
## H.nstopwrds.log 1.627434e+01 1.627434e+01
## H.ratio.nstopwrds.nwrds 1.623508e+01 1.623508e+01
## H.npnct11.log 1.582943e+01 1.582943e+01
## S.T.can 1.557214e+01 1.557214e+01
## H.P.no.comment.colon 1.552028e+01 1.552028e+01
## H.P.friday.night.music 1.488096e+01 1.488096e+01
## A.T.newyorktim 1.471359e+01 1.471359e+01
## S.npnct04.log 1.357564e+01 1.357564e+01
## H.T.newyork 1.337672e+01 1.337672e+01
## S.T.share 1.322366e+01 1.322366e+01
## S.npnct08.log 1.319408e+01 1.319408e+01
## H.sum.TfIdf 1.282767e+01 1.282767e+01
## H.P.recap.colon 1.273820e+01 1.273820e+01
## S.T.one 1.238107e+01 1.238107e+01
## H.npnct07.log 1.222449e+01 1.222449e+01
## PubDate.last10.log 1.193952e+01 1.193952e+01
## H.T.report 1.191324e+01 1.191324e+01
## A.nwrds.unq.log 1.190743e+01 1.190743e+01
## A.T.report 1.155728e+01 1.155728e+01
## PubDate.hour.fctr 1.144174e+01 1.144174e+01
## A.T.articl 1.128851e+01 1.128851e+01
## A.sum.TfIdf 1.110991e+01 1.110991e+01
## S.nstopwrds.log 1.099737e+01 1.099737e+01
## PubDate.minute.fctr 1.055593e+01 1.055593e+01
## H.T.polit 1.032948e+01 1.032948e+01
## S.ratio.nstopwrds.nwrds 1.026798e+01 1.026798e+01
## A.T.intern 9.914726e+00 9.914726e+00
## S.T.time 9.904830e+00 9.904830e+00
## H.npnct12.log 9.888614e+00 9.888614e+00
## S.T.take 9.852005e+00 9.852005e+00
## H.T.art 9.842033e+00 9.842033e+00
## H.npnct13.log 9.829552e+00 9.829552e+00
## PubDate.second.fctr 9.794379e+00 9.794379e+00
## H.T.week 9.718351e+00 9.718351e+00
## H.T.get 9.576775e+00 9.576775e+00
## S.npnct01.log 9.564919e+00 9.564919e+00
## A.T.will 9.491070e+00 9.491070e+00
## S.T.show 9.342530e+00 9.342530e+00
## H.T.new 9.203584e+00 9.203584e+00
## .rnorm 8.941026e+00 8.941026e+00
## H.ratio.sum.TfIdf.nwrds 8.939183e+00 8.939183e+00
## S.ndgts.log 8.755923e+00 8.755923e+00
## H.T.say 8.671813e+00 8.671813e+00
## A.T.first 8.505413e+00 8.505413e+00
## A.T.photo 8.036467e+00 8.036467e+00
## H.T.china 7.799595e+00 7.799595e+00
## H.npnct01.log 7.787548e+00 7.787548e+00
## H.T.make 7.674641e+00 7.674641e+00
## A.T.senat 7.437682e+00 7.437682e+00
## S.T.said 7.364645e+00 7.364645e+00
## S.T.day 7.260435e+00 7.260435e+00
## H.npnct28.log 7.016220e+00 7.016220e+00
## H.T.news 6.901869e+00 6.901869e+00
## H.npnct16.log 6.896467e+00 6.896467e+00
## H.T.take 6.881392e+00 6.881392e+00
## S.npnct12.log 6.296543e+00 6.296543e+00
## H.T.busi 5.892173e+00 5.892173e+00
## A.T.compani 5.701348e+00 5.701348e+00
## S.npnct11.log 5.458567e+00 5.458567e+00
## H.T.day 5.341694e+00 5.341694e+00
## A.T.word 5.341188e+00 5.341188e+00
## H.P.facts.figures 5.332319e+00 5.332319e+00
## H.T.X2014 5.105515e+00 5.105515e+00
## PubDate.last1.log 5.029197e+00 5.029197e+00
## S.T.obama 4.950818e+00 4.950818e+00
## PubDate.date.fctr 4.692266e+00 4.692266e+00
## H.T.big 4.491165e+00 4.491165e+00
## S.npnct14.log 4.449833e+00 4.449833e+00
## A.npnct16.log 4.218694e+00 4.218694e+00
## S.npnct06.log 3.899006e+00 3.899006e+00
## S.T.appear 3.840502e+00 3.840502e+00
## PubDate.last100.log 3.387278e+00 3.387278e+00
## PubDate.wkend 3.323031e+00 3.323031e+00
## H.T.ebola 3.198159e+00 3.198159e+00
## H.nwrds.log 3.088101e+00 3.088101e+00
## H.T.obama 2.703165e+00 2.703165e+00
## A.T.year 2.411870e+00 2.411870e+00
## A.nchrs.log 2.210854e+00 2.210854e+00
## H.T.test 1.856699e+00 1.856699e+00
## A.T.week 1.735453e+00 1.735453e+00
## H.T.pictur 1.728117e+00 1.728117e+00
## S.nwrds.log 1.511344e+00 1.511344e+00
## H.T.newyorktim 1.280498e+00 1.280498e+00
## S.npnct15.log 1.190329e+00 1.190329e+00
## H.T.bank 1.038507e+00 1.038507e+00
## H.T.billion 7.999936e-01 7.999936e-01
## S.T.new 1.565429e-01 1.565429e-01
## A.T.fashion 1.528913e-01 1.528913e-01
## H.P.fashion.week 1.109378e-01 1.109378e-01
## S.T.archiv 9.386317e-02 9.386317e-02
## S.T.herald 8.612681e-02 8.612681e-02
## H.T.springsumm 7.774263e-02 7.774263e-02
## S.T.tribun 7.345077e-02 7.345077e-02
## H.T.deal 6.047754e-02 6.047754e-02
## H.P.first.draft 4.572882e-02 4.572882e-02
## S.npnct28.log 4.463511e-02 4.463511e-02
## H.P.daily.clip.report 3.416759e-02 3.416759e-02
## H.P.today.in.smallbusiness 2.839539e-02 2.839539e-02
## H.P.verbatim.colon 1.634958e-02 1.634958e-02
## S.P.first.draft 1.547771e-02 1.547771e-02
## H.npnct02.log 1.526519e-02 1.526519e-02
## H.P.quandary 1.236942e-02 1.236942e-02
## S.npnct20.log 1.220902e-02 1.220902e-02
## S.npnct03.log 1.207385e-02 1.207385e-02
## A.npnct18.log 1.043696e-02 1.043696e-02
## A.T.presid 9.440632e-03 9.440632e-03
## S.T.presid 9.421346e-03 9.421346e-03
## S.P.year.colon 6.252341e-03 6.252341e-03
## H.P.on.this.day 5.059061e-03 5.059061e-03
## H.npnct05.log 4.614811e-03 4.614811e-03
## S.npnct07.log 2.450155e-03 2.450155e-03
## S.P.fashion.week 8.838548e-04 8.838548e-04
## H.P.s.notebook 0.000000e+00 0.000000e+00
## .clusterid NA NA
## A.ndgts.log NA NA
## A.npnct01.log NA NA
## A.npnct02.log NA NA
## A.npnct03.log NA NA
## A.npnct04.log NA NA
## A.npnct05.log NA NA
## A.npnct06.log NA NA
## A.npnct07.log NA NA
## A.npnct08.log NA NA
## A.npnct09.log NA NA
## A.npnct10.log NA NA
## A.npnct11.log NA NA
## A.npnct12.log NA NA
## A.npnct14.log NA NA
## A.npnct15.log NA NA
## A.npnct17.log NA NA
## A.npnct20.log NA NA
## A.npnct21.log NA NA
## A.npnct22.log NA NA
## A.npnct23.log NA NA
## A.npnct24.log NA NA
## A.npnct25.log NA NA
## A.npnct26.log NA NA
## A.npnct27.log NA NA
## A.npnct28.log NA NA
## A.npnct29.log NA NA
## A.npnct30.log NA NA
## A.nstopwrds.log NA NA
## A.nuppr.log NA NA
## A.nwrds.log NA NA
## A.P.daily.clip.report NA NA
## A.P.fashion.week NA NA
## A.P.first.draft NA NA
## A.P.http NA NA
## A.P.metropolitan.diary.colon NA NA
## A.P.year.colon NA NA
## A.ratio.nstopwrds.nwrds NA NA
## A.ratio.sum.TfIdf.nwrds NA NA
## A.T.appear NA NA
## A.T.archiv NA NA
## A.T.can NA NA
## A.T.day NA NA
## A.T.diari NA NA
## A.T.herald NA NA
## A.T.make NA NA
## A.T.new NA NA
## A.T.obama NA NA
## A.T.one NA NA
## A.T.said NA NA
## A.T.share NA NA
## A.T.show NA NA
## A.T.take NA NA
## A.T.time NA NA
## A.T.tribun NA NA
## H.nchrs.log NA NA
## H.npnct03.log NA NA
## H.npnct04.log NA NA
## H.npnct06.log NA NA
## H.npnct09.log NA NA
## H.npnct10.log NA NA
## H.npnct14.log NA NA
## H.npnct17.log NA NA
## H.npnct18.log NA NA
## H.npnct20.log NA NA
## H.npnct21.log NA NA
## H.npnct22.log NA NA
## H.npnct23.log NA NA
## H.npnct24.log NA NA
## H.npnct25.log NA NA
## H.npnct26.log NA NA
## H.npnct27.log NA NA
## H.npnct29.log NA NA
## H.npnct30.log NA NA
## H.nwrds.unq.log NA NA
## H.P.http NA NA
## H.P.today.in.politic NA NA
## H.P.what.we.are NA NA
## H.P.year.colon NA NA
## H.T.clip NA NA
## H.T.daili NA NA
## H.T.fashion NA NA
## H.T.first NA NA
## H.T.morn NA NA
## H.T.today NA NA
## H.T.X2015 NA NA
## Popular NA NA
## Popular.fctr NA NA
## PubDate.last1 NA NA
## PubDate.last10 NA NA
## PubDate.last100 NA NA
## PubDate.month.fctr NA NA
## PubDate.POSIX NA NA
## PubDate.year.fctr NA NA
## PubDate.zoo NA NA
## S.nchrs.log NA NA
## S.npnct02.log NA NA
## S.npnct05.log NA NA
## S.npnct09.log NA NA
## S.npnct10.log NA NA
## S.npnct13.log NA NA
## S.npnct16.log NA NA
## S.npnct17.log NA NA
## S.npnct18.log NA NA
## S.npnct19.log NA NA
## S.npnct21.log NA NA
## S.npnct22.log NA NA
## S.npnct23.log NA NA
## S.npnct24.log NA NA
## S.npnct25.log NA NA
## S.npnct26.log NA NA
## S.npnct27.log NA NA
## S.npnct29.log NA NA
## S.npnct30.log NA NA
## S.nwrds.unq.log NA NA
## S.P.daily.clip.report NA NA
## S.P.http NA NA
## S.sum.TfIdf NA NA
## S.T.articl NA NA
## S.T.compani NA NA
## S.T.fashion NA NA
## S.T.first NA NA
## S.T.intern NA NA
## S.T.newyork NA NA
## S.T.newyorktim NA NA
## S.T.photo NA NA
## S.T.report NA NA
## S.T.senat NA NA
## S.T.week NA NA
## S.T.will NA NA
## S.T.word NA NA
## S.T.year NA NA
## UniqueID NA NA
## WordCount NA NA
# Diagnostic plots for the final model on the training observations.
# For binomial classification, pass the `opt.prob.threshold.OOB` value that
# glb_models_df records for the selected model; otherwise pass NULL.
# A scalar `if`/`else` is used instead of `ifelse(cond, x, NULL)` because
# ifelse() cannot return NULL: it stops with "replacement has length zero"
# whenever the condition is FALSE.
glb_analytics_diag_plots(obs_df=glb_trnobs_df, mdl_id=glb_fin_mdl_id,
    prob_threshold=if (glb_is_classification && glb_is_binomial) {
        glb_models_df[glb_models_df$model_id == glb_sel_mdl_id,
                      "opt.prob.threshold.OOB"]
    } else {
        NULL
    })
## Warning in glb_analytics_diag_plots(obs_df = glb_trnobs_df, mdl_id =
## glb_fin_mdl_id, : Limiting important feature scatter plots to 5 out of 132
## [1] "Min/Max Boundaries: "
## UniqueID Popular.fctr Popular.fctr.predict.Final.glm.prob
## 4 4 Y 1.935740e-01
## 1923 1923 Y 2.336116e-01
## 33 33 Y 7.411922e-01
## 1507 1507 N 4.246347e-05
## 6370 6370 Y 6.635282e-01
## 1334 1334 N 3.186994e-01
## 24 24 N 4.006257e-01
## 386 386 N 4.171079e-01
## 5560 5560 N 4.539239e-01
## 855 855 N 4.911909e-01
## 7 7 N 5.231005e-01
## 20 20 N 5.810526e-01
## 15 15 N 8.968103e-01
## 17 17 N 9.619673e-01
## 2018 2018 N 9.934248e-01
## Popular.fctr.predict.Final.glm
## 4 N
## 1923 N
## 33 Y
## 1507 N
## 6370 Y
## 1334 Y
## 24 Y
## 386 Y
## 5560 Y
## 855 Y
## 7 Y
## 20 Y
## 15 Y
## 17 Y
## 2018 Y
## Popular.fctr.predict.Final.glm.accurate
## 4 FALSE
## 1923 FALSE
## 33 TRUE
## 1507 TRUE
## 6370 TRUE
## 1334 FALSE
## 24 FALSE
## 386 FALSE
## 5560 FALSE
## 855 FALSE
## 7 FALSE
## 20 FALSE
## 15 FALSE
## 17 FALSE
## 2018 FALSE
## Popular.fctr.predict.Final.glm.error .label
## 4 -0.10642604 4
## 1923 -0.06638843 1923
## 33 0.00000000 33
## 1507 0.00000000 1507
## 6370 0.00000000 6370
## 1334 0.01869940 1334
## 24 0.10062572 24
## 386 0.11710786 386
## 5560 0.15392391 5560
## 855 0.19119091 855
## 7 0.22310053 7
## 20 0.28105257 20
## 15 0.59681034 15
## 17 0.66196733 17
## 2018 0.69342484 2018
## [1] "Inaccurate: "
## UniqueID Popular.fctr Popular.fctr.predict.Final.glm.prob
## 2182 2182 Y 0.000794161
## 4020 4020 Y 0.001186979
## 3554 3554 Y 0.001897312
## 5486 5486 Y 0.004738465
## 4352 4352 Y 0.005823666
## 6441 6441 Y 0.006445450
## Popular.fctr.predict.Final.glm
## 2182 N
## 4020 N
## 3554 N
## 5486 N
## 4352 N
## 6441 N
## Popular.fctr.predict.Final.glm.accurate
## 2182 FALSE
## 4020 FALSE
## 3554 FALSE
## 5486 FALSE
## 4352 FALSE
## 6441 FALSE
## Popular.fctr.predict.Final.glm.error
## 2182 -0.2992058
## 4020 -0.2988130
## 3554 -0.2981027
## 5486 -0.2952615
## 4352 -0.2941763
## 6441 -0.2935546
## UniqueID Popular.fctr Popular.fctr.predict.Final.glm.prob
## 5911 5911 Y 0.05528951
## 6272 6272 N 0.34600193
## 5076 5076 N 0.42473612
## 1322 1322 N 0.67211031
## 1478 1478 N 0.81393448
## 6329 6329 N 0.97798211
## Popular.fctr.predict.Final.glm
## 5911 N
## 6272 Y
## 5076 Y
## 1322 Y
## 1478 Y
## 6329 Y
## Popular.fctr.predict.Final.glm.accurate
## 5911 FALSE
## 6272 FALSE
## 5076 FALSE
## 1322 FALSE
## 1478 FALSE
## 6329 FALSE
## Popular.fctr.predict.Final.glm.error
## 5911 -0.24471049
## 6272 0.04600193
## 5076 0.12473612
## 1322 0.37211031
## 1478 0.51393448
## 6329 0.67798211
## UniqueID Popular.fctr Popular.fctr.predict.Final.glm.prob
## 770 770 N 0.9788535
## 1085 1085 N 0.9793714
## 4975 4975 N 0.9879920
## 1667 1667 N 0.9911922
## 2018 2018 N 0.9934248
## 59 59 N 0.9939983
## Popular.fctr.predict.Final.glm
## 770 Y
## 1085 Y
## 4975 Y
## 1667 Y
## 2018 Y
## 59 Y
## Popular.fctr.predict.Final.glm.accurate
## 770 FALSE
## 1085 FALSE
## 4975 FALSE
## 1667 FALSE
## 2018 FALSE
## 59 FALSE
## Popular.fctr.predict.Final.glm.error
## 770 0.6788535
## 1085 0.6793714
## 4975 0.6879920
## 1667 0.6911922
## 2018 0.6934248
## 59 0.6939983
# Gather the ids of all features that have a non-missing value in at least one
# column of glb_feats_df whose name contains the literal text ".importance".
dsp_feats_vctr <- c(NULL)
for (var in names(glb_feats_df)[grepl(".importance", names(glb_feats_df),
                                      fixed=TRUE)]) {
    dsp_feats_vctr <- union(dsp_feats_vctr,
                            glb_feats_df[["id"]][!is.na(glb_feats_df[[var]])])
}

# Show every column whose name matches glb_rsp_var for the training
# observations whose UniqueID is listed in FN_OOB_ids.
print(glb_trnobs_df[glb_trnobs_df$UniqueID %in% FN_OOB_ids,
                    grep(glb_rsp_var, names(glb_trnobs_df), value=TRUE)])
## Popular.fctr Popular.fctr.predict.Final.glm.prob
## 92 Y 0.040556020
## 693 Y 0.055687379
## 4020 Y 0.001186979
## 4721 Y 0.008424242
## Popular.fctr.predict.Final.glm
## 92 N
## 693 N
## 4020 N
## 4721 N
# List the columns present in glb_trnobs_df but missing from glb_allobs_df.
print(setdiff(colnames(glb_trnobs_df), colnames(glb_allobs_df)))
## [1] "Popular.fctr.predict.Final.glm.prob"
## [2] "Popular.fctr.predict.Final.glm"
# Copy each column that exists only in glb_trnobs_df into the rows of
# glb_allobs_df flagged .src == "Train". Values are assigned positionally,
# which assumes those rows are in the same order as glb_trnobs_df
# -- TODO confirm (open question from the author: merge or cbind instead?).
for (col in setdiff(names(glb_trnobs_df), names(glb_allobs_df))) {
    glb_allobs_df[glb_allobs_df$.src == "Train", col] <- glb_trnobs_df[[col]]
}
# List any columns of glb_fitobs_df that glb_allobs_df does not have.
print(setdiff(colnames(glb_fitobs_df), colnames(glb_allobs_df)))
## character(0)
# List the columns present in glb_OOBobs_df but missing from glb_allobs_df.
print(setdiff(colnames(glb_OOBobs_df), colnames(glb_allobs_df)))
## character(0)
# Copy each column that exists only in glb_OOBobs_df into the rows of
# glb_allobs_df flagged .lcn == "OOB". Values are assigned positionally,
# which assumes those rows are in the same order as glb_OOBobs_df
# -- TODO confirm (open question from the author: merge or cbind instead?).
for (col in setdiff(names(glb_OOBobs_df), names(glb_allobs_df))) {
    glb_allobs_df[glb_allobs_df$.lcn == "OOB", col] <- glb_OOBobs_df[[col]]
}
# List any columns of glb_newobs_df that glb_allobs_df does not have.
print(setdiff(colnames(glb_newobs_df), colnames(glb_allobs_df)))
## character(0)
# When glb_save_envir is set, write the main analysis objects to the file
# "<glb_out_pfx>dsk.RData". The per-subset observation data frames are
# deliberately left out (kept below as a commented-out entry).
if (glb_save_envir) {
    save(list=c("glb_feats_df", "glb_allobs_df",
                #"glb_trnobs_df", "glb_fitobs_df", "glb_OOBobs_df", "glb_newobs_df",
                "glb_models_df", "dsp_models_df", "glb_models_lst",
                "glb_model_type",
                "glb_sel_mdl", "glb_sel_mdl_id",
                "glb_fin_mdl", "glb_fin_mdl_id"),
         file=paste0(glb_out_pfx, "dsk.RData"))
}
# Register the two objects produced by this step as available, then replay the
# analytics petri net with the updated list.
# The update is done as its own statement: when it was buried inside the
# replay.trans argument it only happened if (and when) replay.petrisim forced
# that lazily evaluated argument.
glb_analytics_avl_objs <- c(glb_analytics_avl_objs,
                            "data.training.all.prediction", "model.final")
replay.petrisim(pn = glb_analytics_pn,
                replay.trans = glb_analytics_avl_objs,
                flip_coord = TRUE)
## time trans "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 multiple enabled transitions: data.training.all data.new model.selected firing: data.training.all
## 1.0000 1 2 1 0 0
## 1.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction firing: data.new
## 2.0000 2 1 1 1 0
## 2.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction data.new.prediction firing: model.selected
## 3.0000 3 0 2 1 0
## 3.0000 multiple enabled transitions: model.final data.training.all.prediction data.new.prediction firing: data.training.all.prediction
## 4.0000 5 0 1 1 1
## 4.0000 multiple enabled transitions: model.final data.training.all.prediction data.new.prediction firing: model.final
## 5.0000 4 0 0 2 1
# Log the start of the "predict.data.new" step in glb_chunks_df.
# major.inc=TRUE presumably advances the major step counter (8 -> 9 in the
# table echoed below) -- confirm against myadd_chunk.
glb_chunks_df <- myadd_chunk(glb_chunks_df, "predict.data.new", major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 15 fit.data.training 8 1 558.729 569.943 11.214
## 16 predict.data.new 9 0 569.943 NA NA
9.0: predict data new
# Compute final model predictions
# Score the new observations with the final model.
# The default probability threshold is the OOB-optimal threshold of the
# *selected* model and is only supplied for binomial classification.
# A scalar if/else is used instead of ifelse(): ifelse(FALSE, x, NULL) stops
# with "replacement has length zero", so the non-binomial path could never
# actually pass NULL.
glb_newobs_df <- glb_get_predictions(
    glb_newobs_df,
    mdl_id = glb_fin_mdl_id,
    rsp_var_out = glb_rsp_var_out,
    prob_threshold_def =
        if (glb_is_classification && glb_is_binomial) {
            glb_models_df[glb_models_df$model_id == glb_sel_mdl_id,
                          "opt.prob.threshold.OOB"]
        } else {
            NULL
        }
)
## Warning in glb_get_predictions(glb_newobs_df, mdl_id = glb_fin_mdl_id,
## rsp_var_out = glb_rsp_var_out, : Using default probability threshold: 0.3
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
# Diagnostic plots for the final model on the new observations.
# prob_threshold is the OOB-optimal threshold of the selected model for
# binomial classification, otherwise NULL. A scalar if/else replaces ifelse()
# because ifelse(FALSE, x, NULL) errors ("replacement has length zero")
# instead of yielding NULL.
glb_analytics_diag_plots(
    obs_df = glb_newobs_df,
    mdl_id = glb_fin_mdl_id,
    prob_threshold =
        if (glb_is_classification && glb_is_binomial) {
            glb_models_df[glb_models_df$model_id == glb_sel_mdl_id,
                          "opt.prob.threshold.OOB"]
        } else {
            NULL
        }
)
## Warning in glb_analytics_diag_plots(obs_df = glb_newobs_df, mdl_id =
## glb_fin_mdl_id, : Limiting important feature scatter plots to 5 out of 132
## Warning in loop_apply(n, do.ply): no non-missing arguments to min;
## returning Inf
## Warning in loop_apply(n, do.ply): no non-missing arguments to max;
## returning -Inf
## Warning in loop_apply(n, do.ply): Removed 1870 rows containing missing
## values (geom_point).
## Warning in loop_apply(n, do.ply): Removed 1870 rows containing missing
## values (geom_point).
## Warning in loop_apply(n, do.ply): no non-missing arguments to min;
## returning Inf
## Warning in loop_apply(n, do.ply): no non-missing arguments to max;
## returning -Inf
## Warning in loop_apply(n, do.ply): Removed 1870 rows containing missing
## values (geom_point).
## Warning in loop_apply(n, do.ply): Removed 1870 rows containing missing
## values (geom_point).
## Warning in loop_apply(n, do.ply): no non-missing arguments to min;
## returning Inf
## Warning in loop_apply(n, do.ply): no non-missing arguments to max;
## returning -Inf
## Warning in loop_apply(n, do.ply): Removed 1870 rows containing missing
## values (geom_point).
## Warning in loop_apply(n, do.ply): Removed 1870 rows containing missing
## values (geom_point).
## Warning in loop_apply(n, do.ply): no non-missing arguments to min;
## returning Inf
## Warning in loop_apply(n, do.ply): no non-missing arguments to max;
## returning -Inf
## Warning in loop_apply(n, do.ply): Removed 1870 rows containing missing
## values (geom_point).
## Warning in loop_apply(n, do.ply): Removed 1870 rows containing missing
## values (geom_point).
## Warning in loop_apply(n, do.ply): no non-missing arguments to min;
## returning Inf
## Warning in loop_apply(n, do.ply): no non-missing arguments to max;
## returning -Inf
## Warning in loop_apply(n, do.ply): Removed 1870 rows containing missing
## values (geom_point).
## Warning in loop_apply(n, do.ply): Removed 1870 rows containing missing
## values (geom_point).
## [1] "Min/Max Boundaries: "
## UniqueID Popular.fctr Popular.fctr.predict.Final.glm.prob
## 6630 6630 <NA> 6.176454e-13
## 6753 6753 <NA> 9.103801e-01
## 7056 7056 <NA> 2.929341e-01
## 7309 7309 <NA> 1.145708e-04
## Popular.fctr.predict.Final.glm
## 6630 N
## 6753 Y
## 7056 N
## 7309 N
## Popular.fctr.predict.Final.glm.accurate
## 6630 NA
## 6753 NA
## 7056 NA
## 7309 NA
## Popular.fctr.predict.Final.glm.error .label
## 6630 0 6630
## 6753 0 6753
## 7056 0 7056
## 7309 0 7309
## [1] "Inaccurate: "
## UniqueID Popular.fctr Popular.fctr.predict.Final.glm.prob
## NA NA <NA> NA
## NA.1 NA <NA> NA
## NA.2 NA <NA> NA
## NA.3 NA <NA> NA
## NA.4 NA <NA> NA
## NA.5 NA <NA> NA
## Popular.fctr.predict.Final.glm
## NA <NA>
## NA.1 <NA>
## NA.2 <NA>
## NA.3 <NA>
## NA.4 <NA>
## NA.5 <NA>
## Popular.fctr.predict.Final.glm.accurate
## NA NA
## NA.1 NA
## NA.2 NA
## NA.3 NA
## NA.4 NA
## NA.5 NA
## Popular.fctr.predict.Final.glm.error
## NA NA
## NA.1 NA
## NA.2 NA
## NA.3 NA
## NA.4 NA
## NA.5 NA
## UniqueID Popular.fctr Popular.fctr.predict.Final.glm.prob
## NA.168 NA <NA> NA
## NA.243 NA <NA> NA
## NA.261 NA <NA> NA
## NA.1464 NA <NA> NA
## NA.1493 NA <NA> NA
## NA.1501 NA <NA> NA
## Popular.fctr.predict.Final.glm
## NA.168 <NA>
## NA.243 <NA>
## NA.261 <NA>
## NA.1464 <NA>
## NA.1493 <NA>
## NA.1501 <NA>
## Popular.fctr.predict.Final.glm.accurate
## NA.168 NA
## NA.243 NA
## NA.261 NA
## NA.1464 NA
## NA.1493 NA
## NA.1501 NA
## Popular.fctr.predict.Final.glm.error
## NA.168 NA
## NA.243 NA
## NA.261 NA
## NA.1464 NA
## NA.1493 NA
## NA.1501 NA
## UniqueID Popular.fctr Popular.fctr.predict.Final.glm.prob
## NA.1864 NA <NA> NA
## NA.1865 NA <NA> NA
## NA.1866 NA <NA> NA
## NA.1867 NA <NA> NA
## NA.1868 NA <NA> NA
## NA.1869 NA <NA> NA
## Popular.fctr.predict.Final.glm
## NA.1864 <NA>
## NA.1865 <NA>
## NA.1866 <NA>
## NA.1867 <NA>
## NA.1868 <NA>
## NA.1869 <NA>
## Popular.fctr.predict.Final.glm.accurate
## NA.1864 NA
## NA.1865 NA
## NA.1866 NA
## NA.1867 NA
## NA.1868 NA
## NA.1869 NA
## Popular.fctr.predict.Final.glm.error
## NA.1864 NA
## NA.1865 NA
## NA.1866 NA
## NA.1867 NA
## NA.1868 NA
## NA.1869 NA
## Warning in loop_apply(n, do.ply): Removed 1870 rows containing missing
## values (geom_point).
# Build the submission file: the id column(s) plus the final model's predicted
# probability, written to "<glb_out_pfx><glb_fin_mdl_id>_submit.csv" with every
# "." in the file stem replaced by "_".
submit_df <- glb_newobs_df[, c(glb_id_vars,
                               paste0(glb_rsp_var_out, glb_fin_mdl_id, ".prob"))]
# The probability column is always the last one selected above, so rename it
# by that position rather than a hard-coded 2; this stays correct when
# glb_id_vars holds more (or fewer) than one column.
names(submit_df)[ncol(submit_df)] <- "Probability1"
write.csv(submit_df,
          paste0(gsub(".", "_", paste0(glb_out_pfx, glb_fin_mdl_id), fixed = TRUE),
                 "_submit.csv"),
          row.names = FALSE)
# print(orderBy(~ -max.auc.OOB, glb_models_df[, c("model_id",
#                                     "max.auc.OOB", "max.Accuracy.OOB")]))
# OOB-optimal probability threshold of the selected model.
print(
    glb_models_df[
        glb_models_df$model_id == glb_sel_mdl_id,
        "opt.prob.threshold.OOB"
    ]
)
## [1] 0.3
# Ids of the selected and the final model.
print(
    sprintf("glb_sel_mdl_id: %s", glb_sel_mdl_id)
)
## [1] "glb_sel_mdl_id: Low.cor.X.glm"
print(
    sprintf("glb_fin_mdl_id: %s", glb_fin_mdl_id)
)
## [1] "glb_fin_mdl_id: Final.glm"
# Size (rows, columns) of the fit data set.
print(
    dim(glb_fitobs_df)
)
## [1] 4475 276
# Model comparison table.
print(
    dsp_models_df
)
## model_id max.Accuracy.OOB max.auc.OOB max.Kappa.OOB
## 8 Low.cor.X.glm 0.9095771 0.9229354 0.6890845
## 9 All.X.glm 0.8988819 0.9159268 0.6561351
## 10 All.X.no.rnorm.rpart 0.8862421 0.7084504 0.5054039
## 1 MFO.myMFO_classfr 0.8327662 0.5000000 0.0000000
## 3 Max.cor.Y.cv.0.rpart 0.8327662 0.5000000 0.0000000
## 4 Max.cor.Y.cv.0.cp.0.rpart 0.8327662 0.5000000 0.0000000
## 5 Max.cor.Y.rpart 0.8327662 0.5000000 0.0000000
## 7 Interact.High.cor.Y.glm 0.8021390 0.7911694 0.3451612
## 6 Max.cor.Y.glm 0.7316480 0.7102060 0.2283681
## 2 Random.myrandom_classfr 0.1672338 0.4877001 0.0000000
## min.aic.fit opt.prob.threshold.OOB
## 8 2088.981 0.3
## 9 2140.884 0.3
## 10 NA 0.7
## 1 NA 0.5
## 3 NA 0.5
## 4 NA 0.5
## 5 NA 0.5
## 7 3300.299 0.3
## 6 3714.601 0.2
## 2 NA 0.1
# Confusion matrix of the selected model on the OOB observations, transposed
# so that rows are the reference classes and columns the predictions.
print(
    sprintf("%s OOB confusion matrix & accuracy: ", glb_sel_mdl_id)
)
## [1] "Low.cor.X.glm OOB confusion matrix & accuracy: "
print(
    t(
        confusionMatrix(
            glb_OOBobs_df[, paste0(glb_rsp_var_out, glb_sel_mdl_id)],
            glb_OOBobs_df[, glb_rsp_var]
        )$table
    )
)
## Prediction
## Reference N Y
## N 1601 112
## Y 74 270
# Per-category OOB accuracy: cross-tabulate myCategory against the
# accurate/inaccurate flag of the OOB predictions.
tmp_OOBent_df <- glb_OOBobs_df[, c("myCategory", predct_accurate_var_name)]
names(tmp_OOBent_df)[2] <- "accurate.OOB"
aOOB_ctgry_df <- mycreate_xtab_df(tmp_OOBent_df, names(tmp_OOBent_df))

# Missing cells in the cross-tab are treated as zero counts.
aOOB_ctgry_df[is.na(aOOB_ctgry_df)] <- 0

# Total OOB observations per category and the share predicted accurately.
aOOB_ctgry_df <- mutate(
    aOOB_ctgry_df,
    .n.OOB = accurate.OOB.FALSE + accurate.OOB.TRUE,
    max.accuracy.OOB = accurate.OOB.TRUE / .n.OOB
)

# Columns shared with glb_ctgry_df; merge() below joins on all of them.
intersect(names(glb_ctgry_df), names(aOOB_ctgry_df))
## [1] "myCategory" ".n.OOB"
glb_ctgry_df <- merge(glb_ctgry_df, aOOB_ctgry_df, all = TRUE)

# Categories with the most inaccurate OOB predictions first.
print(
    orderBy(~-accurate.OOB.FALSE, glb_ctgry_df)
)
## myCategory .n.OOB .n.Tst .freqRatio.Tst
## 1 ## 407 338 0.180748663
## 15 OpEd#Opinion# 154 164 0.087700535
## 6 Business#Business Day#Dealbook 312 304 0.162566845
## 9 Business#Technology# 114 113 0.060427807
## 18 Styles#U.S.# 54 62 0.033155080
## 16 Science#Health# 66 57 0.030481283
## 10 Culture#Arts# 225 244 0.130481283
## 8 Business#Crosswords/Games# 40 42 0.022459893
## 13 Metro#N.Y. / Region# 60 67 0.035828877
## 4 #Opinion#The Public Editor 10 10 0.005347594
## 20 TStyle## 221 105 0.056149733
## 7 Business#Business Day#Small Business 45 42 0.022459893
## 3 #Opinion#Room For Debate 21 24 0.012834225
## 17 Styles##Fashion 41 15 0.008021390
## 2 #Multimedia# 42 52 0.027807487
## 5 #U.S.#Education 93 90 0.048128342
## 11 Foreign#World# 47 47 0.025133690
## 12 Foreign#World#Asia Pacific 61 56 0.029946524
## 14 myOther 13 3 0.001604278
## 19 Travel#Travel# 31 35 0.018716578
## .freqRatio.OOB accurate.OOB.FALSE accurate.OOB.TRUE max.accuracy.OOB
## 1 0.197860963 35 372 0.9140049
## 15 0.074866310 29 125 0.8116883
## 6 0.151677200 27 285 0.9134615
## 9 0.055420515 23 91 0.7982456
## 18 0.026251823 18 36 0.6666667
## 16 0.032085561 17 49 0.7424242
## 10 0.109382596 15 210 0.9333333
## 8 0.019445795 7 33 0.8250000
## 13 0.029168692 5 55 0.9166667
## 4 0.004861449 3 7 0.7000000
## 20 0.107438017 3 218 0.9864253
## 7 0.021876519 2 43 0.9555556
## 3 0.010209042 1 20 0.9523810
## 17 0.019931940 1 40 0.9756098
## 2 0.020418085 0 42 1.0000000
## 5 0.045211473 0 93 1.0000000
## 11 0.022848809 0 47 1.0000000
## 12 0.029654837 0 61 1.0000000
## 14 0.006319883 0 13 1.0000000
## 19 0.015070491 0 31 1.0000000
# Print OOB diagnostics of the selected model for one myCategory value:
#   1. the (transposed) confusion matrix restricted to that category,
#   2. the share of accurate predictions within the category,
#   3. the false-negative errors (Popular == 1) and
#   4. the false-positive errors (Popular == 0),
#      each with cluster id, response and the three text columns.
#
# myCategory: category value compared against glb_OOBobs_df$myCategory.
# Returns (invisibly) the false-positive data frame, i.e. the value of the
# last print().
dsp_myCategory_conf_mtrx <- function(myCategory) {
    print(sprintf("%s OOB::myCategory=%s confusion matrix & accuracy: ",
                  glb_sel_mdl_id, myCategory))

    # Row mask for the requested category, reused by every subset below.
    in_ctgry <- glb_OOBobs_df$myCategory == myCategory

    print(t(confusionMatrix(
        glb_OOBobs_df[in_ctgry, paste0(glb_rsp_var_out, glb_sel_mdl_id)],
        glb_OOBobs_df[in_ctgry, glb_rsp_var])$table))

    # Accuracy within the category = accurate predictions / category size.
    print(sum(glb_OOBobs_df[in_ctgry, predct_accurate_var_name]) /
          nrow(glb_OOBobs_df[in_ctgry, ]))

    # Ids of the category's inaccurate predictions.
    err_ids <- glb_OOBobs_df[
        in_ctgry & (!glb_OOBobs_df[, predct_accurate_var_name]),
        glb_id_vars]
    is_err <- glb_OOBobs_df$UniqueID %in% err_ids

    # Print the count and the rows of the errors whose true Popular value is
    # popular_val; msg_fmt is the sprintf template for the count line.
    dsp_errs <- function(popular_val, msg_fmt) {
        errs_df <- glb_OOBobs_df[
            is_err & (glb_OOBobs_df$Popular == popular_val),
            c(
                ".clusterid",
                "Popular", "Headline", "Snippet", "Abstract")]
        print(sprintf(msg_fmt, glb_sel_mdl_id, myCategory, nrow(errs_df)))
        print(errs_df)
    }

    dsp_errs(1, "%s OOB::myCategory=%s FN errors: %d")
    dsp_errs(0, "%s OOB::myCategory=%s FP errors: %d")
}
# Per-category drill-down of the OOB errors. Only the "##" category is run
# here; the other two calls are kept commented out.
#dsp_myCategory_conf_mtrx(myCategory="OpEd#Opinion#")
#dsp_myCategory_conf_mtrx(myCategory="Business#Business Day#Dealbook")
dsp_myCategory_conf_mtrx(myCategory="##")
## [1] "Low.cor.X.glm OOB::myCategory=## confusion matrix & accuracy: "
## Prediction
## Reference N Y
## N 353 19
## Y 16 19
## [1] 0.9140049
## [1] "Low.cor.X.glm OOB::myCategory=## FN errors: 16"
## .clusterid Popular
## 163 1503 1
## 685 1509 1
## 1092 1522 1
## 1404 1505 1
## 1489 1513 1
## 2088 1517 1
## 3074 1512 1
## 3263 1510 1
## 3235 1522 1
## 3472 1521 1
## 3492 1513 1
## 3882 1512 1
## 4173 1510 1
## 4632 1512 1
## 4775 1521 1
## 5058 1516 1
## Headline
## 163 Why Leaked Nude Photos Are Another Frontier for Feminists
## 685 What Janay Rice Wants
## 1092 How to Fake Your Next Vacation
## 1404 When Tips Are Not Enough
## 1489 What to Expect From Narendra Modi at the United Nations
## 2088 Narendra Modi, in U.N. Speech, Inserts India Into Terrorism Fight
## 3074 Inside the Bounds of a Hasidic Neighborhood
## 3263 Discussion of Pedophilia Turns Heated
## 3235 Columbus Day, or Indigenous Peoples Day?
## 3472 Yes Means Yes: The Big Consent Debate
## 3492 Life and Death Through the Eyes of an Ebola Nurse
## 3882 To Die at Home: Reporters Notebook
## 4173 Should a Child Offender Be Treated as an Adult?
## 4632 Tim Cook's Coming Out: Reporter's Notebook
## 4775 A Midterm Election Quiz: Testing Our Political Assumptions
## 5058 Sexual Harassment at Yale: Delicate Subject, High-Impact Investigation
## Snippet
## 163 After nude photos of famous women were leaked on the Internet, many observers are asking if online harassment of celebrities and women can be reined in or punished.
## 685 What impact does the video of Ray Rice punching his wife, Janay, have on her, and should news outlets take that into consideration?
## 1092 A Dutch artist faked a trip to Southeast Asia through a series of carefully manipulated Facebook photos, demonstrating the power and ubiquity of deception in the digital age.
## 1404 A recent campaign aims to convince hotel guests to tip more, but critics say tips are a poor substitute for wages.
## 1489 Since taking power in May, Prime Minister Narendra Modi of India has had runaway success with major speeches in part because of their character colloquial, earthy and extemporaneous. Will that tone translate to the United Nations General...
## 2088 At the United Nations General Assembly, Prime Minister Narendra Modi of India signaled his support for the United States renewed focus on fighting terrorism.
## 3074 The Hasidic community is known for its traditions and insularity. The artist and journalist Annie Berman set out to break down the groups boundaries through an ad she places on Craigslist.
## 3263 An Op-Ed essay about what the author said were misconceptions about pedophilia drew more than 1,200 comments mostly negative, but with a wide range of views expressed.
## 3235 Some places have designated the second Monday in October elsewhere laid aside for Columbus Day as a new holiday for Native Americans, pitting members of the Italian-American community against American Indians.
## 3472 A longstanding debate about what constitutes consent in sexual interaction has been reignited by the conversation about sexual violence on campus and elsewhere.
## 3492 Ben C. Solomon is a Times video journalist reporting on Ebola. His video today, about a team of ambulance drivers in Monrovia, Liberia, shows the dangers they face every day.
## 3882 A journey taken with a woman and her dying father led Nina Bernstein to write an article that speaks to the challenges many readers also face.
## 4173 Or is it better for a 10-year-old boy who committed homicide or teenage football players who brutally hazed teammates to be placed in the juvenile justice system?
## 4632 James B. Stewart provides insight into his column about the Apple chief executives announcement that he is proud to be gay.
## 4775 Do you know whos a Republican or a liberal?
## 5058 Tamar Lewin talks about reporting on a sexual harassment case and the months it took to find people who would speak on the record and provide documents.
## Abstract
## 163 After nude photos of famous women were leaked on the Internet, many observers are asking if online harassment of celebrities and women can be reined in or punished.
## 685 What impact does the video of Ray Rice punching his wife, Janay, have on her, and should news outlets take that into consideration?
## 1092 A Dutch artist faked a trip to Southeast Asia through a series of carefully manipulated Facebook photos, demonstrating the power and ubiquity of deception in the digital age.
## 1404 A recent campaign aims to convince hotel guests to tip more, but critics say tips are a poor substitute for wages.
## 1489 Since taking power in May, Prime Minister Narendra Modi of India has had runaway success with major speeches in part because of their character colloquial, earthy and extemporaneous. Will that tone translate to the United Nations General Assembly?
## 2088 At the United Nations General Assembly, Prime Minister Narendra Modi of India signaled his support for the United States renewed focus on fighting terrorism.
## 3074 The Hasidic community is known for its traditions and insularity. The artist and journalist Annie Berman set out to break down the groups boundaries through an ad she places on Craigslist.
## 3263 An Op-Ed essay about what the author said were misconceptions about pedophilia drew more than 1,200 comments mostly negative, but with a wide range of views expressed.
## 3235 Some places have designated the second Monday in October elsewhere laid aside for Columbus Day as a new holiday for Native Americans, pitting members of the Italian-American community against American Indians.
## 3472 A longstanding debate about what constitutes consent in sexual interaction has been reignited by the conversation about sexual violence on campus and elsewhere.
## 3492 Ben C. Solomon is a Times video journalist reporting on Ebola. His video today, about a team of ambulance drivers in Monrovia, Liberia, shows the dangers they face every day.
## 3882 A journey taken with a woman and her dying father led Nina Bernstein to write an article that speaks to the challenges many readers also face.
## 4173 Or is it better for a 10-year-old boy who committed homicide or teenage football players who brutally hazed teammates to be placed in the juvenile justice system?
## 4632 James B. Stewart provides insight into his column about the Apple chief executives announcement that he is proud to be gay.
## 4775 Do you know whos a Republican or a liberal?
## 5058 Tamar Lewin talks about reporting on a sexual harassment case and the months it took to find people who would speak on the record and provide documents.
## [1] "Low.cor.X.glm OOB::myCategory=## FP errors: 19"
## .clusterid Popular
## 194 1524 0
## 359 1501 0
## 640 1522 0
## 962 1522 0
## 1322 1519 0
## 1355 1509 0
## 1685 1503 0
## 1905 1506 0
## 2122 1519 0
## 3670 1503 0
## 3783 1522 0
## 3948 1503 0
## 3946 1518 0
## 4572 1503 0
## 4911 1509 0
## 4955 1509 0
## 5470 1512 0
## 5625 1516 0
## 5834 1505 0
## Headline
## 194 Emoji for Breakfast, Dinosaur for Lunch
## 359 Watching the Sun Set on the Summer Movie Season
## 640 How Much of the Worst Summer of News Ever Can You Take?
## 962 What Moving the Capital Would Mean for Argentina
## 1322 Updates on the Scottish Independence Referendum
## 1355 Do You Need a Worse Phone?
## 1685 Who Needs a Cuddle Buddy? Everyone, It Turns Out
## 1905 Making Infrastructure Sexy
## 2122 Want a Robot Pen Pal?
## 3670 Will an App Change Your Writing Life?
## 3783 Is American Horror Story the Best (Weirdest) Show on Television?
## 3948 How Italy Became the Refugees Gateway to Europe
## 3946 S. Dakota Senate Race Remains Close Over EB-5 Scandal (Wait, What?)
## 4572 Talking to Bellevue Workers About Ebola's Stigma
## 4911 When Spell-Check Can’t Help
## 4955 When a Stranger Delivers Your Texts
## 5470 Will Serial Change How We Talk About Crime?
## 5625 Do You Want To Know About Your Brain?
## 5834 When Is a Chocolate Shortage a Good Thing?
## Snippet
## 194 From the emoji diet to Gwyneth Paltrows detox, tales of diet experiments can illuminate how we eat.
## 359 The Op-Ed columnists debate the union of church and real estate in Love Is Strange and explore future dystopias in The Giver. And one columnist pleads: Stop giving actors extra credit for playing gay.
## 640 This summer has given us horrific news story after horrific news story. Does this inevitably lead to compassion fatigue, and if so, how can we fight it?
## 962 Argentina may move its capital from bustling Buenos Aires to a provincial city in the sleepy north. What would this mean for the countrys infamously stormy economy and contentious political climate?
## 1322 The Times is providing updates on the counting of the ballots, which stretched into the early hours of Friday morning.
## 1355 The Apple Watch is coming, but some say less-sophisticated phones are the more sophisticated choice.
## 1685 A new app called Cuddlr helps users locate someone to cuddle with. The idea of cuddling with a total stranger may seem uncomfortable, but it may be more important than we think.
## 1905 The people behind the Living City videos talked about their adventures in filming.
## 2122 Robots can now write us letters. Will we read them?
## 3670 Apps like Lists for Writers aim to help with creativity, but some question whether thats really possible.
## 3783 The FX series has revived the anthology genre, and with subject matter that both fascinates and repulses. But is it a case of genuine creative genius, or running on the fumes of hype?
## 3948 A Vice documentary series highlights the plight of African and Middle Eastern refugees headed for Italian shores and what happens to them once (or if) they arrive.
## 3946 A visa-investment program in South Dakota is taking up a lot of airtime in the states Senate race.
## 4572 Anemona Hartocollis discusses how she compared the statements of officials with the realities of health care workers dealing with Ebola in New York City.
## 4911 Heres another batch of word mix-ups, both familiar and novel. Some are pure homophone errors, other examples show confusion between similar words with related, but not identical, meanings.
## 4955 Miranda Julys app aims to shake up your life, or at least some of your conversations. Does it deliver?
## 5470 The popular podcast has already spawned online investigations and an extended debate over the ethics of amateurs trying to solve crimes.
## 5625 New research suggests many people dont think that much about brain science. Should they?
## 5834 There may be less chocolate to go around and more people going for it, but there is a silver lining.
## Abstract
## 194 From the emoji diet to Gwyneth Paltrows detox, tales of diet experiments can illuminate how we eat.
## 359 The Op-Ed columnists debate the union of church and real estate in Love Is Strange and explore future dystopias in The Giver. And one columnist pleads: Stop giving actors extra credit for playing gay.
## 640 This summer has given us horrific news story after horrific news story. Does this inevitably lead to compassion fatigue, and if so, how can we fight it?
## 962 Argentina may move its capital from bustling Buenos Aires to a provincial city in the sleepy north. What would this mean for the countrys infamously stormy economy and contentious political climate?
## 1322 The Times is providing updates on the counting of the ballots, which stretched into the early hours of Friday morning.
## 1355 The Apple Watch is coming, but some say less-sophisticated phones are the more sophisticated choice.
## 1685 A new app called Cuddlr helps users locate someone to cuddle with. The idea of cuddling with a total stranger may seem uncomfortable, but it may be more important than we think.
## 1905 The people behind the Living City videos talked about their adventures in filming.
## 2122 Robots can now write us letters. Will we read them?
## 3670 Apps like Lists for Writers aim to help with creativity, but some question whether thats really possible.
## 3783 The FX series has revived the anthology genre, and with subject matter that both fascinates and repulses. But is it a case of genuine creative genius, or running on the fumes of hype?
## 3948 A Vice documentary series highlights the plight of African and Middle Eastern refugees headed for Italian shores and what happens to them once (or if) they arrive.
## 3946 A visa-investment program in South Dakota is taking up a lot of airtime in the states Senate race.
## 4572 Anemona Hartocollis discusses how she compared the statements of officials with the realities of health care workers dealing with Ebola in New York City.
## 4911 Heres another batch of word mix-ups, both familiar and novel. Some are pure homophone errors, other examples show confusion between similar words with related, but not identical, meanings.
## 4955 Miranda Julys app aims to shake up your life, or at least some of your conversations. Does it deliver?
## 5470 The popular podcast has already spawned online investigations and an extended debate over the ethics of amateurs trying to solve crimes.
## 5625 New research suggests many people dont think that much about brain science. Should they?
## 5834 There may be less chocolate to go around and more people going for it, but there is a silver lining.
# Inspect the OOB observations whose UniqueID is listed in FN_OOB_ids:
# first their response-related columns, then their raw text columns.
print(
    "FN_OOB_ids:"
)
## [1] "FN_OOB_ids:"
print(
    glb_OOBobs_df[
        glb_OOBobs_df$UniqueID %in% FN_OOB_ids,
        grep(glb_rsp_var, names(glb_OOBobs_df), value = TRUE)
    ]
)
## [1] Popular.fctr
## [2] Popular.fctr.predict.Low.cor.X.glm.prob
## [3] Popular.fctr.predict.Low.cor.X.glm
## [4] Popular.fctr.predict.Low.cor.X.glm.accurate
## <0 rows> (or 0-length row.names)
print(
    glb_OOBobs_df[
        glb_OOBobs_df$UniqueID %in% FN_OOB_ids,
        glb_txt_vars
    ]
)
## [1] Headline Snippet Abstract
## <0 rows> (or 0-length row.names)
# Column totals of the derived text features (columns containing "H.", "S." or
# "A.") for the OOB observations listed in FN_OOB_ids, excluding character
# columns and ".fctr" columns.
# The dot in the pattern is escaped so that only a literal "." matches; with
# the unescaped "[HSA]." any name containing H, S or A followed by any
# character was selected (e.g. PubDate.POSIX), and such a date-time column
# makes colSums() fail as soon as any row is selected. This matches the
# pattern used in dsp_hdlpfx_results.
print(dsp_vctr <- colSums(glb_OOBobs_df[glb_OOBobs_df$UniqueID %in% FN_OOB_ids,
    setdiff(grep("[HSA]\\.", names(glb_OOBobs_df), value = TRUE),
            union(myfind_chr_cols_df(glb_OOBobs_df),
                  grep(".fctr", names(glb_OOBobs_df), fixed = TRUE, value = TRUE)))]))
## PubDate.POSIX H.T.X2014
## 0 0
## H.T.X2015 H.T.art
## 0 0
## H.T.bank H.T.big
## 0 0
## H.T.billion H.T.busi
## 0 0
## H.T.china H.T.daili
## 0 0
## H.T.day H.T.deal
## 0 0
## H.T.fashion H.T.first
## 0 0
## H.T.make H.T.morn
## 0 0
## H.T.new H.T.news
## 0 0
## H.T.newyork H.T.obama
## 0 0
## H.T.pictur H.T.polit
## 0 0
## H.T.report H.T.say
## 0 0
## H.T.springsumm H.T.take
## 0 0
## H.T.test H.T.today
## 0 0
## H.T.week S.T.articl
## 0 0
## S.T.can S.T.compani
## 0 0
## S.T.day S.T.fashion
## 0 0
## S.T.first S.T.intern
## 0 0
## S.T.make S.T.new
## 0 0
## S.T.newyork S.T.newyorktim
## 0 0
## S.T.one S.T.presid
## 0 0
## S.T.report S.T.said
## 0 0
## S.T.share S.T.show
## 0 0
## S.T.take S.T.time
## 0 0
## S.T.week S.T.will
## 0 0
## S.T.year A.T.articl
## 0 0
## A.T.can A.T.compani
## 0 0
## A.T.day A.T.fashion
## 0 0
## A.T.first A.T.intern
## 0 0
## A.T.make A.T.new
## 0 0
## A.T.newyork A.T.newyorktim
## 0 0
## A.T.one A.T.presid
## 0 0
## A.T.report A.T.said
## 0 0
## A.T.share A.T.show
## 0 0
## A.T.take A.T.time
## 0 0
## A.T.week A.T.will
## 0 0
## A.T.year H.T.clip
## 0 0
## H.T.ebola H.T.get
## 0 0
## H.T.newyorktim H.T.read
## 0 0
## H.T.word H.nwrds.log
## 0 0
## H.nwrds.unq.log H.sum.TfIdf
## 0 0
## H.ratio.sum.TfIdf.nwrds H.nchrs.log
## 0 0
## H.nuppr.log H.ndgts.log
## 0 0
## H.npnct01.log H.npnct02.log
## 0 0
## H.npnct03.log H.npnct04.log
## 0 0
## H.npnct05.log H.npnct06.log
## 0 0
## H.npnct07.log H.npnct08.log
## 0 0
## H.npnct09.log H.npnct10.log
## 0 0
## H.npnct11.log H.npnct12.log
## 0 0
## H.npnct13.log H.npnct14.log
## 0 0
## H.npnct15.log H.npnct16.log
## 0 0
## H.npnct17.log H.npnct18.log
## 0 0
## H.npnct19.log H.npnct20.log
## 0 0
## H.npnct21.log H.npnct22.log
## 0 0
## H.npnct23.log H.npnct24.log
## 0 0
## H.npnct25.log H.npnct26.log
## 0 0
## H.npnct27.log H.npnct28.log
## 0 0
## H.npnct29.log H.npnct30.log
## 0 0
## H.nstopwrds.log H.ratio.nstopwrds.nwrds
## 0 0
## H.P.http H.P.year.colon
## 0 0
## H.P.daily.clip.report H.P.fashion.week
## 0 0
## H.P.first.draft H.P.facts.figures
## 0 0
## H.P.friday.night.music H.P.no.comment.colon
## 0 0
## H.P.on.this.day H.P.quandary
## 0 0
## H.P.readers.respond H.P.recap.colon
## 0 0
## H.P.s.notebook H.P.today.in.politic
## 0 0
## H.P.today.in.smallbusiness H.P.verbatim.colon
## 0 0
## H.P.what.we.are S.T.appear
## 0 0
## S.T.archiv S.T.diari
## 0 0
## S.T.herald S.T.obama
## 0 0
## S.T.photo S.T.senat
## 0 0
## S.T.tribun S.T.word
## 0 0
## S.nwrds.log S.nwrds.unq.log
## 0 0
## S.sum.TfIdf S.ratio.sum.TfIdf.nwrds
## 0 0
## S.nchrs.log S.nuppr.log
## 0 0
## S.ndgts.log S.npnct01.log
## 0 0
## S.npnct02.log S.npnct03.log
## 0 0
## S.npnct04.log S.npnct05.log
## 0 0
## S.npnct06.log S.npnct07.log
## 0 0
## S.npnct08.log S.npnct09.log
## 0 0
## S.npnct10.log S.npnct11.log
## 0 0
## S.npnct12.log S.npnct13.log
## 0 0
## S.npnct14.log S.npnct15.log
## 0 0
## S.npnct16.log S.npnct17.log
## 0 0
## S.npnct18.log S.npnct19.log
## 0 0
## S.npnct20.log S.npnct21.log
## 0 0
## S.npnct22.log S.npnct23.log
## 0 0
## S.npnct24.log S.npnct25.log
## 0 0
## S.npnct26.log S.npnct27.log
## 0 0
## S.npnct28.log S.npnct29.log
## 0 0
## S.npnct30.log S.nstopwrds.log
## 0 0
## S.ratio.nstopwrds.nwrds S.P.http
## 0 0
## S.P.year.colon S.P.daily.clip.report
## 0 0
## S.P.fashion.week S.P.first.draft
## 0 0
## S.P.metropolitan.diary.colon A.T.appear
## 0 0
## A.T.archiv A.T.diari
## 0 0
## A.T.herald A.T.obama
## 0 0
## A.T.photo A.T.senat
## 0 0
## A.T.tribun A.T.word
## 0 0
## A.nwrds.log A.nwrds.unq.log
## 0 0
## A.sum.TfIdf A.ratio.sum.TfIdf.nwrds
## 0 0
## A.nchrs.log A.nuppr.log
## 0 0
## A.ndgts.log A.npnct01.log
## 0 0
## A.npnct02.log A.npnct03.log
## 0 0
## A.npnct04.log A.npnct05.log
## 0 0
## A.npnct06.log A.npnct07.log
## 0 0
## A.npnct08.log A.npnct09.log
## 0 0
## A.npnct10.log A.npnct11.log
## 0 0
## A.npnct12.log A.npnct13.log
## 0 0
## A.npnct14.log A.npnct15.log
## 0 0
## A.npnct16.log A.npnct17.log
## 0 0
## A.npnct18.log A.npnct19.log
## 0 0
## A.npnct20.log A.npnct21.log
## 0 0
## A.npnct22.log A.npnct23.log
## 0 0
## A.npnct24.log A.npnct25.log
## 0 0
## A.npnct26.log A.npnct27.log
## 0 0
## A.npnct28.log A.npnct29.log
## 0 0
## A.npnct30.log A.nstopwrds.log
## 0 0
## A.ratio.nstopwrds.nwrds A.P.http
## 0 0
## A.P.year.colon A.P.daily.clip.report
## 0 0
## A.P.fashion.week A.P.first.draft
## 0 0
## A.P.metropolitan.diary.colon
## 0
# Print prediction details for observations whose Headline.pfx is in hdlpfx:
#   1. hdlpfx itself,
#   2. response-related columns of the matching OOB observations,
#   3. response-related columns of the matching new observations,
#   4. column totals of the "H."/"S."/"A." features (excluding character and
#      ".fctr" columns) over the matching new observations,
#   5. the non-zero totals only,
#   6. the matching new observations restricted to those non-zero features
#      plus the character columns.
#
# hdlpfx: one or more Headline.pfx values.
# Returns (invisibly) the data frame printed in step 6.
dsp_hdlpfx_results <- function(hdlpfx) {
    print(hdlpfx)

    is_OOB_match <- glb_OOBobs_df$Headline.pfx %in% c(hdlpfx)
    print(glb_OOBobs_df[is_OOB_match,
                        grep(glb_rsp_var, names(glb_OOBobs_df), value=TRUE)])

    is_new_match <- glb_newobs_df$Headline.pfx %in% c(hdlpfx)
    print(glb_newobs_df[is_new_match,
                        grep(glb_rsp_var, names(glb_newobs_df), value=TRUE)])

    # Numeric text-derived features: names containing "H.", "S." or "A.",
    # minus character columns and factor (".fctr") columns.
    feat_cols <- setdiff(
        grep("[HSA]\\.", names(glb_newobs_df), value=TRUE),
        union(myfind_chr_cols_df(glb_newobs_df),
              grep(".fctr", names(glb_newobs_df), fixed=TRUE, value=TRUE)))
    dsp_vctr <- colSums(glb_newobs_df[is_new_match, feat_cols])
    print(dsp_vctr)

    # Keep only the features that occur at least once in the matching rows.
    dsp_vctr <- dsp_vctr[dsp_vctr != 0]
    print(dsp_vctr)

    print(glb_newobs_df[is_new_match,
                        union(names(dsp_vctr), myfind_chr_cols_df(glb_newobs_df))])
}
#dsp_hdlpfx_results(hdlpfx="Ask Well::")
# print("myMisc::|OpEd|blank|blank|1:")
# print(glb_OOBobs_df[glb_OOBobs_df$UniqueID %in% c(6446),
# grep(glb_rsp_var, names(glb_OOBobs_df), value=TRUE)])
# print(glb_OOBobs_df[glb_OOBobs_df$UniqueID %in% FN_OOB_ids,
# c("WordCount", "WordCount.log", "myMultimedia",
# "NewsDesk", "SectionName", "SubsectionName")])
# print(mycreate_sqlxtab_df(glb_allobs_df[sel_obs(Headline.contains="[Vv]ideo"), ],
# c(glb_rsp_var, "myMultimedia")))
# dsp_chisq.test(Headline.contains="[Vv]ideo")
# print(glb_allobs_df[sel_obs(Headline.contains="[Vv]ideo"),
# c(glb_rsp_var, "Popular", "myMultimedia", "Headline")])
# print(glb_allobs_df[sel_obs(Headline.contains="[Ee]bola", Popular=1),
# c(glb_rsp_var, "Popular", "myMultimedia", "Headline",
# "NewsDesk", "SectionName", "SubsectionName")])
# print(subset(glb_feats_df, !is.na(importance))[,
# c("is.ConditionalX.y",
# grep("importance", names(glb_feats_df), fixed=TRUE, value=TRUE))])
# print(subset(glb_feats_df, is.ConditionalX.y & is.na(importance))[,
# c("is.ConditionalX.y",
# grep("importance", names(glb_feats_df), fixed=TRUE, value=TRUE))])
# Show the variance flags (zeroVar, nzv, myNearZV) together with every column
# whose name contains "importance", restricted to the features that have a
# non-missing value in the importance column.
print(glb_feats_df[!is.na(glb_feats_df[["importance"]]),
                   c(c("zeroVar", "nzv", "myNearZV"),
                     grep("importance", names(glb_feats_df),
                          fixed=TRUE, value=TRUE))])
## zeroVar nzv myNearZV importance
## WordCount.log FALSE FALSE FALSE 1.000000e+02
## H.P.readers.respond FALSE TRUE FALSE 5.146747e+01
## myCategory.fctr FALSE FALSE FALSE 4.281250e+01
## H.npnct19.log FALSE FALSE FALSE 4.004695e+01
## H.npnct15.log FALSE FALSE FALSE 3.467378e+01
## .clusterid.fctr FALSE FALSE FALSE 2.932308e+01
## A.npnct13.log FALSE FALSE FALSE 2.871074e+01
## A.npnct19.log FALSE FALSE FALSE 2.850228e+01
## S.nuppr.log FALSE FALSE FALSE 2.501950e+01
## S.T.diari FALSE TRUE FALSE 2.124067e+01
## H.T.word FALSE TRUE FALSE 2.105883e+01
## H.npnct08.log FALSE TRUE FALSE 2.045852e+01
## H.T.read FALSE TRUE FALSE 2.019170e+01
## H.ndgts.log FALSE FALSE FALSE 1.951144e+01
## S.P.metropolitan.diary.colon FALSE TRUE FALSE 1.888990e+01
## S.ratio.sum.TfIdf.nwrds FALSE FALSE FALSE 1.854118e+01
## A.T.newyork FALSE TRUE FALSE 1.825819e+01
## H.nuppr.log FALSE FALSE FALSE 1.817153e+01
## S.T.make FALSE TRUE FALSE 1.723536e+01
## PubDate.wkday.fctr FALSE FALSE FALSE 1.661497e+01
## H.nstopwrds.log FALSE FALSE FALSE 1.627434e+01
## H.ratio.nstopwrds.nwrds FALSE FALSE FALSE 1.623508e+01
## H.npnct11.log FALSE FALSE FALSE 1.582943e+01
## S.T.can FALSE TRUE FALSE 1.557214e+01
## H.P.no.comment.colon FALSE TRUE FALSE 1.552028e+01
## H.P.friday.night.music FALSE TRUE FALSE 1.488096e+01
## A.T.newyorktim FALSE TRUE FALSE 1.471359e+01
## S.npnct04.log FALSE TRUE FALSE 1.357564e+01
## H.T.newyork FALSE TRUE FALSE 1.337672e+01
## S.T.share FALSE TRUE FALSE 1.322366e+01
## S.npnct08.log FALSE TRUE FALSE 1.319408e+01
## H.sum.TfIdf FALSE FALSE FALSE 1.282767e+01
## H.P.recap.colon FALSE TRUE FALSE 1.273820e+01
## S.T.one FALSE TRUE FALSE 1.238107e+01
## H.npnct07.log FALSE FALSE FALSE 1.222449e+01
## PubDate.last10.log FALSE FALSE FALSE 1.193952e+01
## H.T.report FALSE TRUE FALSE 1.191324e+01
## A.nwrds.unq.log FALSE FALSE FALSE 1.190743e+01
## A.T.report FALSE TRUE FALSE 1.155728e+01
## PubDate.hour.fctr FALSE FALSE FALSE 1.144174e+01
## A.T.articl FALSE TRUE FALSE 1.128851e+01
## A.sum.TfIdf FALSE FALSE FALSE 1.110991e+01
## S.nstopwrds.log FALSE FALSE FALSE 1.099737e+01
## PubDate.minute.fctr FALSE FALSE FALSE 1.055593e+01
## H.T.polit FALSE TRUE FALSE 1.032948e+01
## S.ratio.nstopwrds.nwrds FALSE FALSE FALSE 1.026798e+01
## A.T.intern FALSE TRUE FALSE 9.914726e+00
## S.T.time FALSE TRUE FALSE 9.904830e+00
## H.npnct12.log FALSE FALSE FALSE 9.888614e+00
## S.T.take FALSE TRUE FALSE 9.852005e+00
## H.T.art FALSE TRUE FALSE 9.842033e+00
## H.npnct13.log FALSE TRUE FALSE 9.829552e+00
## PubDate.second.fctr FALSE FALSE FALSE 9.794379e+00
## H.T.week FALSE TRUE FALSE 9.718351e+00
## H.T.get FALSE TRUE FALSE 9.576775e+00
## S.npnct01.log FALSE TRUE FALSE 9.564919e+00
## A.T.will FALSE TRUE FALSE 9.491070e+00
## S.T.show FALSE TRUE FALSE 9.342530e+00
## H.T.new FALSE TRUE FALSE 9.203584e+00
## .rnorm FALSE FALSE FALSE 8.941026e+00
## H.ratio.sum.TfIdf.nwrds FALSE FALSE FALSE 8.939183e+00
## S.ndgts.log FALSE FALSE FALSE 8.755923e+00
## H.T.say FALSE TRUE FALSE 8.671813e+00
## A.T.first FALSE TRUE FALSE 8.505413e+00
## A.T.photo FALSE TRUE FALSE 8.036467e+00
## H.T.china FALSE TRUE FALSE 7.799595e+00
## H.npnct01.log FALSE TRUE FALSE 7.787548e+00
## H.T.make FALSE TRUE FALSE 7.674641e+00
## A.T.senat FALSE TRUE FALSE 7.437682e+00
## S.T.said FALSE TRUE FALSE 7.364645e+00
## S.T.day FALSE TRUE FALSE 7.260435e+00
## H.npnct28.log FALSE TRUE FALSE 7.016220e+00
## H.T.news FALSE TRUE FALSE 6.901869e+00
## H.npnct16.log FALSE TRUE FALSE 6.896467e+00
## H.T.take FALSE TRUE FALSE 6.881392e+00
## S.npnct12.log FALSE FALSE FALSE 6.296543e+00
## H.T.busi FALSE TRUE FALSE 5.892173e+00
## A.T.compani FALSE TRUE FALSE 5.701348e+00
## S.npnct11.log FALSE FALSE FALSE 5.458567e+00
## H.T.day FALSE TRUE FALSE 5.341694e+00
## A.T.word FALSE TRUE FALSE 5.341188e+00
## H.P.facts.figures FALSE TRUE FALSE 5.332319e+00
## H.T.X2014 FALSE TRUE FALSE 5.105515e+00
## PubDate.last1.log FALSE FALSE FALSE 5.029197e+00
## S.T.obama FALSE TRUE FALSE 4.950818e+00
## PubDate.date.fctr FALSE FALSE FALSE 4.692266e+00
## H.T.big FALSE TRUE FALSE 4.491165e+00
## S.npnct14.log FALSE TRUE FALSE 4.449833e+00
## A.npnct16.log FALSE TRUE FALSE 4.218694e+00
## S.npnct06.log FALSE TRUE FALSE 3.899006e+00
## S.T.appear FALSE TRUE FALSE 3.840502e+00
## PubDate.last100.log FALSE FALSE FALSE 3.387278e+00
## PubDate.wkend FALSE FALSE FALSE 3.323031e+00
## H.T.ebola FALSE TRUE FALSE 3.198159e+00
## H.nwrds.log FALSE FALSE FALSE 3.088101e+00
## H.T.obama FALSE TRUE FALSE 2.703165e+00
## A.T.year FALSE TRUE FALSE 2.411870e+00
## A.nchrs.log FALSE FALSE FALSE 2.210854e+00
## H.T.test FALSE TRUE FALSE 1.856699e+00
## A.T.week FALSE TRUE FALSE 1.735453e+00
## H.T.pictur FALSE TRUE FALSE 1.728117e+00
## S.nwrds.log FALSE FALSE FALSE 1.511344e+00
## H.T.newyorktim FALSE TRUE FALSE 1.280498e+00
## S.npnct15.log FALSE FALSE FALSE 1.190329e+00
## H.T.bank FALSE TRUE FALSE 1.038507e+00
## H.T.billion FALSE TRUE FALSE 7.999936e-01
## S.T.new FALSE TRUE FALSE 1.565429e-01
## A.T.fashion FALSE TRUE FALSE 1.528913e-01
## H.P.fashion.week FALSE TRUE FALSE 1.109378e-01
## S.T.archiv FALSE TRUE FALSE 9.386317e-02
## S.T.herald FALSE TRUE FALSE 8.612681e-02
## H.T.springsumm FALSE TRUE FALSE 7.774263e-02
## S.T.tribun FALSE TRUE FALSE 7.345077e-02
## H.T.deal FALSE TRUE FALSE 6.047754e-02
## H.P.first.draft FALSE TRUE FALSE 4.572882e-02
## S.npnct28.log FALSE TRUE FALSE 4.463511e-02
## H.P.daily.clip.report FALSE TRUE FALSE 3.416759e-02
## H.P.today.in.smallbusiness FALSE TRUE FALSE 2.839539e-02
## H.P.verbatim.colon FALSE TRUE FALSE 1.634958e-02
## S.P.first.draft FALSE TRUE FALSE 1.547771e-02
## H.npnct02.log FALSE TRUE FALSE 1.526519e-02
## H.P.quandary FALSE TRUE FALSE 1.236942e-02
## S.npnct20.log FALSE TRUE FALSE 1.220902e-02
## S.npnct03.log FALSE TRUE FALSE 1.207385e-02
## A.npnct18.log FALSE TRUE FALSE 1.043696e-02
## A.T.presid FALSE TRUE FALSE 9.440632e-03
## S.T.presid FALSE TRUE FALSE 9.421346e-03
## S.P.year.colon FALSE TRUE FALSE 6.252341e-03
## H.P.on.this.day FALSE TRUE FALSE 5.059061e-03
## H.npnct05.log FALSE TRUE FALSE 4.614811e-03
## S.npnct07.log FALSE TRUE FALSE 2.450155e-03
## S.P.fashion.week FALSE TRUE FALSE 8.838548e-04
## H.P.s.notebook FALSE TRUE FALSE 0.000000e+00
## Low.cor.X.glm.importance Final.glm.importance
## WordCount.log 1.000000e+02 1.000000e+02
## H.P.readers.respond 5.146747e+01 5.146747e+01
## myCategory.fctr 4.281250e+01 4.281250e+01
## H.npnct19.log 4.004695e+01 4.004695e+01
## H.npnct15.log 3.467378e+01 3.467378e+01
## .clusterid.fctr 2.932308e+01 2.932308e+01
## A.npnct13.log 2.871074e+01 2.871074e+01
## A.npnct19.log 2.850228e+01 2.850228e+01
## S.nuppr.log 2.501950e+01 2.501950e+01
## S.T.diari 2.124067e+01 2.124067e+01
## H.T.word 2.105883e+01 2.105883e+01
## H.npnct08.log 2.045852e+01 2.045852e+01
## H.T.read 2.019170e+01 2.019170e+01
## H.ndgts.log 1.951144e+01 1.951144e+01
## S.P.metropolitan.diary.colon 1.888990e+01 1.888990e+01
## S.ratio.sum.TfIdf.nwrds 1.854118e+01 1.854118e+01
## A.T.newyork 1.825819e+01 1.825819e+01
## H.nuppr.log 1.817153e+01 1.817153e+01
## S.T.make 1.723536e+01 1.723536e+01
## PubDate.wkday.fctr 1.661497e+01 1.661497e+01
## H.nstopwrds.log 1.627434e+01 1.627434e+01
## H.ratio.nstopwrds.nwrds 1.623508e+01 1.623508e+01
## H.npnct11.log 1.582943e+01 1.582943e+01
## S.T.can 1.557214e+01 1.557214e+01
## H.P.no.comment.colon 1.552028e+01 1.552028e+01
## H.P.friday.night.music 1.488096e+01 1.488096e+01
## A.T.newyorktim 1.471359e+01 1.471359e+01
## S.npnct04.log 1.357564e+01 1.357564e+01
## H.T.newyork 1.337672e+01 1.337672e+01
## S.T.share 1.322366e+01 1.322366e+01
## S.npnct08.log 1.319408e+01 1.319408e+01
## H.sum.TfIdf 1.282767e+01 1.282767e+01
## H.P.recap.colon 1.273820e+01 1.273820e+01
## S.T.one 1.238107e+01 1.238107e+01
## H.npnct07.log 1.222449e+01 1.222449e+01
## PubDate.last10.log 1.193952e+01 1.193952e+01
## H.T.report 1.191324e+01 1.191324e+01
## A.nwrds.unq.log 1.190743e+01 1.190743e+01
## A.T.report 1.155728e+01 1.155728e+01
## PubDate.hour.fctr 1.144174e+01 1.144174e+01
## A.T.articl 1.128851e+01 1.128851e+01
## A.sum.TfIdf 1.110991e+01 1.110991e+01
## S.nstopwrds.log 1.099737e+01 1.099737e+01
## PubDate.minute.fctr 1.055593e+01 1.055593e+01
## H.T.polit 1.032948e+01 1.032948e+01
## S.ratio.nstopwrds.nwrds 1.026798e+01 1.026798e+01
## A.T.intern 9.914726e+00 9.914726e+00
## S.T.time 9.904830e+00 9.904830e+00
## H.npnct12.log 9.888614e+00 9.888614e+00
## S.T.take 9.852005e+00 9.852005e+00
## H.T.art 9.842033e+00 9.842033e+00
## H.npnct13.log 9.829552e+00 9.829552e+00
## PubDate.second.fctr 9.794379e+00 9.794379e+00
## H.T.week 9.718351e+00 9.718351e+00
## H.T.get 9.576775e+00 9.576775e+00
## S.npnct01.log 9.564919e+00 9.564919e+00
## A.T.will 9.491070e+00 9.491070e+00
## S.T.show 9.342530e+00 9.342530e+00
## H.T.new 9.203584e+00 9.203584e+00
## .rnorm 8.941026e+00 8.941026e+00
## H.ratio.sum.TfIdf.nwrds 8.939183e+00 8.939183e+00
## S.ndgts.log 8.755923e+00 8.755923e+00
## H.T.say 8.671813e+00 8.671813e+00
## A.T.first 8.505413e+00 8.505413e+00
## A.T.photo 8.036467e+00 8.036467e+00
## H.T.china 7.799595e+00 7.799595e+00
## H.npnct01.log 7.787548e+00 7.787548e+00
## H.T.make 7.674641e+00 7.674641e+00
## A.T.senat 7.437682e+00 7.437682e+00
## S.T.said 7.364645e+00 7.364645e+00
## S.T.day 7.260435e+00 7.260435e+00
## H.npnct28.log 7.016220e+00 7.016220e+00
## H.T.news 6.901869e+00 6.901869e+00
## H.npnct16.log 6.896467e+00 6.896467e+00
## H.T.take 6.881392e+00 6.881392e+00
## S.npnct12.log 6.296543e+00 6.296543e+00
## H.T.busi 5.892173e+00 5.892173e+00
## A.T.compani 5.701348e+00 5.701348e+00
## S.npnct11.log 5.458567e+00 5.458567e+00
## H.T.day 5.341694e+00 5.341694e+00
## A.T.word 5.341188e+00 5.341188e+00
## H.P.facts.figures 5.332319e+00 5.332319e+00
## H.T.X2014 5.105515e+00 5.105515e+00
## PubDate.last1.log 5.029197e+00 5.029197e+00
## S.T.obama 4.950818e+00 4.950818e+00
## PubDate.date.fctr 4.692266e+00 4.692266e+00
## H.T.big 4.491165e+00 4.491165e+00
## S.npnct14.log 4.449833e+00 4.449833e+00
## A.npnct16.log 4.218694e+00 4.218694e+00
## S.npnct06.log 3.899006e+00 3.899006e+00
## S.T.appear 3.840502e+00 3.840502e+00
## PubDate.last100.log 3.387278e+00 3.387278e+00
## PubDate.wkend 3.323031e+00 3.323031e+00
## H.T.ebola 3.198159e+00 3.198159e+00
## H.nwrds.log 3.088101e+00 3.088101e+00
## H.T.obama 2.703165e+00 2.703165e+00
## A.T.year 2.411870e+00 2.411870e+00
## A.nchrs.log 2.210854e+00 2.210854e+00
## H.T.test 1.856699e+00 1.856699e+00
## A.T.week 1.735453e+00 1.735453e+00
## H.T.pictur 1.728117e+00 1.728117e+00
## S.nwrds.log 1.511344e+00 1.511344e+00
## H.T.newyorktim 1.280498e+00 1.280498e+00
## S.npnct15.log 1.190329e+00 1.190329e+00
## H.T.bank 1.038507e+00 1.038507e+00
## H.T.billion 7.999936e-01 7.999936e-01
## S.T.new 1.565429e-01 1.565429e-01
## A.T.fashion 1.528913e-01 1.528913e-01
## H.P.fashion.week 1.109378e-01 1.109378e-01
## S.T.archiv 9.386317e-02 9.386317e-02
## S.T.herald 8.612681e-02 8.612681e-02
## H.T.springsumm 7.774263e-02 7.774263e-02
## S.T.tribun 7.345077e-02 7.345077e-02
## H.T.deal 6.047754e-02 6.047754e-02
## H.P.first.draft 4.572882e-02 4.572882e-02
## S.npnct28.log 4.463511e-02 4.463511e-02
## H.P.daily.clip.report 3.416759e-02 3.416759e-02
## H.P.today.in.smallbusiness 2.839539e-02 2.839539e-02
## H.P.verbatim.colon 1.634958e-02 1.634958e-02
## S.P.first.draft 1.547771e-02 1.547771e-02
## H.npnct02.log 1.526519e-02 1.526519e-02
## H.P.quandary 1.236942e-02 1.236942e-02
## S.npnct20.log 1.220902e-02 1.220902e-02
## S.npnct03.log 1.207385e-02 1.207385e-02
## A.npnct18.log 1.043696e-02 1.043696e-02
## A.T.presid 9.440632e-03 9.440632e-03
## S.T.presid 9.421346e-03 9.421346e-03
## S.P.year.colon 6.252341e-03 6.252341e-03
## H.P.on.this.day 5.059061e-03 5.059061e-03
## H.npnct05.log 4.614811e-03 4.614811e-03
## S.npnct07.log 2.450155e-03 2.450155e-03
## S.P.fashion.week 8.838548e-04 8.838548e-04
## H.P.s.notebook 0.000000e+00 0.000000e+00
# print(subset(glb_feats_df, is.na(importance))[,
# c("zeroVar", "nzv", "myNearZV",
# grep("importance", names(glb_feats_df), fixed=TRUE, value=TRUE))])
# Report the columns that exist in glb_trnobs_df but not in glb_allobs_df.
print(setdiff(names(glb_trnobs_df), names(glb_allobs_df)))
## character(0)
# Copy each such column into the rows of glb_allobs_df flagged .src "Train".
# %in% is used for the row mask instead of == because it never yields NA; an NA
# in a logical subscript makes a data-frame subscripted assignment fail.
for (col in setdiff(names(glb_trnobs_df), names(glb_allobs_df))) {
    # Merge or cbind ?
    glb_allobs_df[glb_allobs_df$.src %in% "Train", col] <- glb_trnobs_df[[col]]
}
# Report the columns that exist in glb_fitobs_df but not in glb_allobs_df.
print(setdiff(names(glb_fitobs_df), names(glb_allobs_df)))
## character(0)
# Report the columns that exist in glb_OOBobs_df but not in glb_allobs_df.
print(setdiff(names(glb_OOBobs_df), names(glb_allobs_df)))
## character(0)
# Copy each such column into the rows of glb_allobs_df flagged .lcn "OOB".
# %in% is used for the row mask instead of == because it never yields NA; an NA
# in a logical subscript makes a data-frame subscripted assignment fail.
for (col in setdiff(names(glb_OOBobs_df), names(glb_allobs_df))) {
    # Merge or cbind ?
    glb_allobs_df[glb_allobs_df$.lcn %in% "OOB", col] <- glb_OOBobs_df[[col]]
}
# Report the columns that exist in glb_newobs_df but not in glb_allobs_df.
print(setdiff(names(glb_newobs_df), names(glb_allobs_df)))
## character(0)
# When glb_save_envir is set, write the feature table, the combined
# observations and the model objects to "<glb_out_pfx>prdnew_dsk.RData".
if (glb_save_envir) {
    save(list=c("glb_feats_df", "glb_allobs_df",
                #"glb_trnobs_df", "glb_fitobs_df", "glb_OOBobs_df", "glb_newobs_df",
                "glb_models_df", "dsp_models_df", "glb_models_lst", "glb_model_type",
                "glb_sel_mdl", "glb_sel_mdl_id",
                "glb_fin_mdl", "glb_fin_mdl_id"),
         file=paste0(glb_out_pfx, "prdnew_dsk.RData"))
}
# Remove submit_df and tmp_OOBent_df from the workspace.
rm(list=c("submit_df", "tmp_OOBent_df"))
# tmp_replay_lst <- replay.petrisim(pn=glb_analytics_pn,
# replay.trans=(glb_analytics_avl_objs <- c(glb_analytics_avl_objs,
# "data.new.prediction")), flip_coord=TRUE)
# print(ggplot.petrinet(tmp_replay_lst[["pn"]]) + coord_flip())
# Register the "display.session.info" step in the chunk-tracking data frame.
# NOTE(review): major.inc=TRUE presumably advances the major step counter (the
# log printed below shows step_major going from 9 to 10) -- confirm against
# myadd_chunk(), which is defined outside this chunk.
glb_chunks_df <- myadd_chunk(glb_chunks_df, "display.session.info", major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 16 predict.data.new 9 0 569.943 578.939 8.996
## 17 display.session.info 10 0 578.939 NA NA
Null Hypothesis (\(\sf{H_{0}}\)): mpg is not impacted by am_fctr.
The variance by am_fctr appears to be independent.
#{r q1, cache=FALSE}
# print(t.test(subset(cars_df, am_fctr == "automatic")$mpg,
#              subset(cars_df, am_fctr == "manual")$mpg,
#              var.equal=FALSE)$conf)
# We reject the null hypothesis, i.e., we have evidence to conclude that am_fctr impacts mpg (95% confidence). Manual transmission is better for miles per gallon than automatic transmission.
## label step_major step_minor bgn end elapsed
## 6 extract.features 3 0 42.469 213.686 171.217
## 8 select.features 5 0 235.859 370.726 134.867
## 11 fit.models 7 1 425.869 489.069 63.200
## 10 fit.models 7 0 372.049 425.868 53.820
## 14 fit.data.training 8 0 514.310 558.729 44.419
## 7 cluster.data 4 0 213.686 235.859 22.173
## 2 inspect.data 2 0 14.258 33.040 18.782
## 12 fit.models 7 2 489.070 505.385 16.315
## 15 fit.data.training 8 1 558.729 569.943 11.214
## 16 predict.data.new 9 0 569.943 578.939 8.996
## 13 fit.models 7 3 505.386 514.309 8.923
## 4 manage.missing.data 2 2 36.983 42.439 5.456
## 3 cleanse.data 2 1 33.040 36.982 3.942
## 9 partition.data.training 6 0 370.726 372.048 1.322
## 1 import.data 1 0 13.224 14.257 1.033
## 5 encode.data 2 3 42.440 42.468 0.028
## duration
## 6 171.217
## 8 134.867
## 11 63.200
## 10 53.819
## 14 44.419
## 7 22.173
## 2 18.782
## 12 16.315
## 15 11.214
## 16 8.996
## 13 8.923
## 4 5.456
## 3 3.942
## 9 1.322
## 1 1.033
## 5 0.028
## [1] "Total Elapsed Time: 578.939 secs"
## label step_major step_minor bgn end elapsed
## 2 fit.models_1_glm 2 0 430.059 466.393 36.334
## 3 fit.models_1_rpart 3 0 466.394 489.062 22.668
## 1 fit.models_1_bgn 1 0 430.046 430.059 0.013
## duration
## 2 36.334
## 3 22.668
## 1 0.013
## [1] "Total Elapsed Time: 489.062 secs"
## R version 3.1.3 (2015-03-09)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.10.3 (Yosemite)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] tcltk grid parallel stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] rpart.plot_1.5.2 rpart_4.1-9 ROCR_1.0-7
## [4] gplots_2.17.0 caTools_1.17.1 caret_6.0-47
## [7] dynamicTreeCut_1.62 proxy_0.4-14 tm_0.6
## [10] NLP_0.1-7 stringr_1.0.0 mice_2.22
## [13] lattice_0.20-31 Rcpp_0.11.6 plyr_1.8.2
## [16] zoo_1.7-12 sqldf_0.4-10 RSQLite_1.0.0
## [19] DBI_0.3.1 gsubfn_0.6-6 proto_0.3-10
## [22] reshape2_1.4.1 doMC_1.3.3 iterators_1.0.7
## [25] foreach_1.4.2 doBy_4.5-13 survival_2.38-1
## [28] ggplot2_1.0.1
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-6 BradleyTerry2_1.0-6 brglm_0.5-9
## [4] car_2.0-25 chron_2.3-45 class_7.3-12
## [7] codetools_0.2-11 colorspace_1.2-6 compiler_3.1.3
## [10] digest_0.6.8 e1071_1.6-4 evaluate_0.7
## [13] formatR_1.2 gdata_2.16.1 gtable_0.1.2
## [16] gtools_3.4.2 htmltools_0.2.6 KernSmooth_2.23-14
## [19] knitr_1.10.5 labeling_0.3 lme4_1.1-7
## [22] magrittr_1.5 MASS_7.3-40 Matrix_1.2-0
## [25] mgcv_1.8-6 minqa_1.2.4 munsell_0.4.2
## [28] nlme_3.1-120 nloptr_1.0.4 nnet_7.3-9
## [31] pbkrtest_0.4-2 quantreg_5.11 randomForest_4.6-10
## [34] RColorBrewer_1.1-2 rmarkdown_0.5.1 scales_0.2.4
## [37] slam_0.1-32 SparseM_1.6 splines_3.1.3
## [40] stringi_0.4-1 tools_3.1.3 yaml_2.1.13